def generate_conf():
    """
    Generate the config file if it does not exist
    :return:
    """
    conf = os.path.join(os.getcwd(), "app.conf")
    if not os.path.exists(conf):
        content = "# Global configuration\n"
        content += "[APP]\n"
        content += "# Data directory\n"
        content += "DATA_DIR = \n"
        content += "# Which database to use: MySQL or Sqlite3\n"
        content += "DATABASE = \n"
        content += "# MySQL configuration\n"
        content += "[MYSQL]\n"
        content += "# Host\n"
        content += "HOST = \n"
        content += "# Port\n"
        content += "PORT = \n"
        content += "# Username\n"
        content += "USER = \n"
        content += "# Password\n"
        content += "PASSWORD = \n"
        content += "# Database name\n"
        content += "BD_NAME = \n"
        content += "# Character encoding\n"
        content += "CHARSET = \n"
        content += "# Sqlite3 configuration\n"
        content += "[SQLITE3]\n"
        content += "# Database name\n"
        content += "BD_NAME = \n"
        content += "# Character encoding\n"
        content += "CHARSET = \n"
        FileUtil.writ_file(conf, content)
def download_chromedriver():
    """
    Download the Chrome driver
    http://chromedriver.storage.googleapis.com/index.html
    :return:
    """
    # Fetch the list of versions
    url = "http://chromedriver.storage.googleapis.com/"
    result = BeautifulSoup(HttpUtil.get(url, {
        "delimiter": "/",
        "prefix": ""
    }).text, features="lxml")
    prefix = result.find_all("prefix")
    # Filter
    # info = [s.extract() for s in prefix('prefix')]
    local_version = get_local_version(prefix)

    # Fetch the file list under that version
    driver_list = BeautifulSoup(HttpUtil.get(url, {
        "delimiter": "/",
        "prefix": local_version
    }).text, features="lxml")
    filename_list = driver_list.find_all("key")
    for s in filename_list:
        s = s.text
        # If the platform name appears in the file name
        if s.find(sys.platform) != -1:
            filename = s[len(local_version):]
            # Download and extract the file
            HttpUtil.download_file(url + s, None, filename)
            FileUtil.zip_extract(filename, None)
def download_taobao_chromedriver():
    """
    Download chromedriver from the Taobao mirror
    http://npm.taobao.org/mirrors/chromedriver
    :return:
    """
    # Fetch the list of versions
    url = "http://npm.taobao.org/mirrors/chromedriver/"
    result = BeautifulSoup(HttpUtil.get(url).text, features="lxml")
    prefix = result.find("pre").find_all("a")
    # Filter
    # info = [s.extract() for s in prefix('prefix')]
    local_version_url = url + get_local_version(prefix)

    # Fetch the file list under that version
    driver_list = BeautifulSoup(HttpUtil.get(local_version_url).text,
                                features="lxml")
    filename_list = driver_list.find_all("a")
    for s in filename_list:
        s = s.text
        # If the platform name appears in the file name
        if s.find(sys.platform) != -1:
            # Download and extract the file
            HttpUtil.download_file(local_version_url + s, None, s)
            FileUtil.zip_extract(s, None)
def getSVC(target, labels, train_new):
    # If not retraining, load the previously saved classifier
    if not train_new:
        classifier = svm.SVC(kernel='rbf', C=2, gamma=0.001, probability=True)
        classifier = joblib.load(Constants.SKL_DATA_FILE_NAME)
        return classifier

    # First half is trainData, remaining is testData
    train_cells = target

    ###### Now training ########################
    deskewed = [list(map(deskew, row)) for row in train_cells]
    hogdata = [list(map(hog, row)) for row in deskewed]
    trainData = np.float32(hogdata).reshape(-1, 64)
    print(trainData.shape)

    # responses is an array of n * 1, i.e. [[1],[2],[3]...]
    responses = np.int8(labels)

    # Save target and labels to file
    FileUtil.writeTarget(trainData)
    FileUtil.writeLabels(responses)

    # Create a classifier: a support vector classifier
    '''
    Use params in GridSearch
    '''
    # classifier = svm.SVC(gamma=0.001, probability=True)
    classifier = svm.SVC(kernel='rbf', C=2, gamma=0.001, probability=True)

    # We learn the digits on the first half of the digits
    print(trainData.shape)
    print(responses.shape)
    classifier.fit(trainData, labels)

    # Dump the trained classifier
    joblib.dump(classifier, Constants.SKL_DATA_FILE_NAME)
    return classifier
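# Hedged usage sketch (not from the original project): once getSVC() has been
# fitted, a single digit cell can be classified by applying the same
# deskew + HOG preprocessing used during training. predict_cell and its cell
# argument are hypothetical names; deskew/hog are assumed to be the same
# helpers referenced above.
def predict_cell(classifier, cell):
    sample = np.float32(hog(deskew(cell))).reshape(1, -1)
    # predict() returns an array with one label for the single sample
    return classifier.predict(sample)[0]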
def download_and_save_abstracts_for_search_term(search_term, dataset, max_ids):
    abstracts_df = retrieve_pubmed_abstracts([search_term], max_ids)
    dataset_output_directory = os.path.join(global_output_directory_name, dataset)
    FileUtil.create_directory_if_not_exists(dataset_output_directory)
    abstracts_df.to_csv(os.path.join(dataset_output_directory, 'abstracts.csv'))
    return abstracts_df
def update_hosts(new_hosts):
    """
    Overwrite the hosts file
    :param new_hosts: new list of DNS entries
    :return:
    """
    FileUtil.remove_read_only(Constants.HOSTS_PATH)
    FileUtil.write_lines(Constants.HOSTS_PATH, new_hosts)
    # Flush the DNS cache
    os.system("ipconfig /flushdns")
def run_command(directory):
    dir_size = FileUtil.count_dir_size(directory)
    # 107374182400 bytes = 100 GiB
    if dir_size >= 107374182400:
        print(FileUtil.size_unit_format(dir_size))
        print(
            os.system(
                "rclone move /home/reptile-python/images/ gdrive:/images --min-size 100k"
            ))
        print(FileUtil.size_unit_format(FileUtil.count_dir_size(directory)))
        print(os.popen("rclone dedupe gdrive:/images --dedupe-mode newest").read())
        print(os.popen("rclone delete gdrive:/images --max-size 100k").read())
    threading.Timer(21600, run_command, (directory, )).start()
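# Usage sketch: one initial call starts the cycle; run_command() re-schedules
# itself via threading.Timer every 21600 seconds (6 hours). The directory is
# the same local path referenced by the rclone command above.
if __name__ == "__main__":
    run_command("/home/reptile-python/images/")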
def filePath(self):
    """
    :return:
    """
    fileName = os.path.join(os.getcwd(), "content.txt")
    self.assertEqual(os.getcwd(), FileUtil.dirname(fileName))
def get_bing():
    """
    Get the Bing image of the day
    :return:
    """
    data = {'format': 'js', 'idx': 0, 'n': 1}
    try:
        response = HttpUtil.get_json(
            url='http://cn.bing.com/HPImageArchive.aspx', data=data)
        logging.debug(response)
    except Exception as e:
        logging.error("Network request error: %s", e)
        time.sleep(120)
        # Retry and return so the code below never runs with an undefined response
        return get_bing()

    images = response["images"]
    url = "http://cn.bing.com" + images[0]["url"].split("&")[0]
    # Build the directory path
    directory = os.path.join(Constants.APP_DIRECTORY, "images")
    image_name = url.split("=")[1]
    # Build the absolute file path
    image_path = os.path.join(directory, image_name)
    # Download the image
    HttpUtil.download_file(url, directory, image_name)
    # Split file name and extension; if the extension is not bmp
    if os.path.splitext(image_name)[1] != ".bmp":
        # Convert to bmp
        image_path = FileUtil.image_to_bmp(image_path)
    return image_path
def find_and_save_food_disease_dfs(self, ids_and_abstracts, dataset):
    save_directory = os.path.join(global_output_directory_name, dataset)
    FileUtil.create_directory_if_not_exists(save_directory)
    for extractor in self.food_extractors + self.disease_extractors:
        print(extractor.name)
        df_to_save = pd.DataFrame()
        i = 0
        save_file = os.path.join(
            save_directory,
            '{extractor_name}.csv'.format(extractor_name=extractor.name))
        if not os.path.isfile(save_file):
            for (file_name, file_content) in ids_and_abstracts:
                doc = self.english_model(file_content)
                # print(i)
                i += 1
                file_name = str(file_name)
                try:
                    extracted_df = extractor.extract(doc,
                                                     file_name,
                                                     self.dataset,
                                                     save_entities=False)
                    extracted_df['extractor'] = extractor.name
                    extracted_df['file_name'] = file_name
                    df_to_save = df_to_save.append(extracted_df)
                except:
                    if self.verbose:
                        print('Error happened')
                        traceback.print_exc(file=sys.stdout)
                if i % 1000 == 0:
                    df_to_save.drop_duplicates().to_csv(
                        os.path.join(
                            save_directory,
                            '{extractor_name}_{i}.csv'.format(
                                extractor_name=extractor.name, i=i)))
            if df_to_save.shape[0] == 0:
                df_to_save = pd.DataFrame(columns=[
                    'start_char', 'end_char', 'entity_type', 'entity_id',
                    'text', 'sentence', 'sentence_index', 'extractor',
                    'file_name'
                ])
            df_to_save.drop_duplicates().to_csv(save_file)
        else:
            print('File already exists: {0}'.format(save_file))
def delete_dns(dns):
    """
    Remove the given DNS entries from hosts
    :param dns: list of entries
    :return: the hosts lines after removal
    """
    hosts = FileUtil.read_file(Constants.HOSTS_PATH)
    new_hosts = []
    for host in hosts:
        if not ObjectUtil.is_in_array(host.strip("\n"), dns):
            new_hosts.append(host)
    return new_hosts
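# Usage sketch combining delete_dns() with update_hosts() defined above; the
# entries listed here are illustrative placeholders, not real host lines.
if __name__ == "__main__":
    stale_entries = ["0.0.0.0 example.com", "0.0.0.0 example.org"]
    update_hosts(delete_dns(stale_entries))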
def init():
    """
    Initialization
    :return:
    """
    app = FileUtil.Config("app.conf")
    data_dir = app.get("APP", "DATA_DIR")
    if data_dir == "" or data_dir is None:
        raise ValueError("Please configure the data directory!")
    # If the directory does not exist
    if not os.path.exists(data_dir):
        # Parent directory
        parent_path = os.path.dirname(os.path.dirname(__file__))
        # Build the full path
        data_dir = os.path.join(parent_path, data_dir)
        # Create the directory
        os.mkdir(data_dir)

    database = app.get("APP", "DATABASE")
    if database == "" or database is None or (database.lower() != "mysql"
                                              and database.lower() != "sqlite3"):
        raise ValueError("Please configure which database to use!")

    if database.lower() == "mysql":
        database = database.upper()
        host = app.get(database, "HOST")
        if host == "" or host is None:
            raise ValueError("Please configure the MySQL host!")
        port = app.get(database, "PORT")
        if port == "" or port is None:
            raise ValueError("Please configure the MySQL port!")
        user = app.get(database, "USER")
        if user == "" or user is None:
            raise ValueError("Please configure the MySQL username!")
        password = app.get(database, "PASSWORD")
        if password == "" or password is None:
            raise ValueError("Please configure the MySQL password!")
        db_name = app.get(database, "BD_NAME")
        if db_name == "" or db_name is None:
            raise ValueError("Please configure the MySQL database name!")
        charset = app.get(database, "CHARSET")
        if charset == "" or charset is None:
            raise ValueError("Please configure the MySQL character encoding!")

    if database.lower() == "sqlite3":
        database = database.upper()
        db_name = app.get(database, "BD_NAME")
        if db_name == "" or db_name is None:
            raise ValueError("Please configure the Sqlite3 database name!")
        charset = app.get(database, "CHARSET")
        if charset == "" or charset is None:
            raise ValueError("Please configure the Sqlite3 character encoding!")
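# FileUtil.Config is not shown in this file. Below is a minimal sketch of what
# it is assumed to wrap, built on the standard-library configparser; the class
# name, fallback behaviour and encoding are assumptions, not the project's code.
import configparser
import os


class Config:
    """Read an INI-style file and return '' for missing options, matching the
    empty-string checks done by init() above."""

    def __init__(self, file_name):
        self._parser = configparser.ConfigParser()
        self._parser.read(os.path.join(os.getcwd(), file_name), encoding="utf-8")

    def get(self, section, option):
        # fallback='' avoids NoSectionError/NoOptionError for unset values
        return self._parser.get(section, option, fallback="")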
def __init__(self, configFile='redis.properties'):
    """
    Initialization, default: redis.properties
    :param configFile: file name
    """
    config = FileUtil.Properties(configFile)
    password = None
    if not CommonUtil.isEmpty(config.get('pass')):
        password = config.get('pass')
    super(RedisClient, self).__init__(host=config.get('host'),
                                      port=config.get('port', 6379),
                                      password=password)
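# Hedged usage sketch: RedisClient forwards host/port/password to super(), so
# it is assumed to subclass the redis-py client and expose its standard
# commands. The key and value below are illustrative only.
if __name__ == "__main__":
    client = RedisClient("redis.properties")
    client.set("greeting", "hello")
    print(client.get("greeting"))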
def generate_conf():
    """
    Generate the config file if it does not exist
    :return:
    """
    conf = os.path.join(os.getcwd(), "app.conf")
    if not os.path.exists(conf):
        content = """\
# Global configuration
[APP]
# Data directory
DATA_DIR = 
# Which database to use: MySQL or Sqlite3
DATABASE = 
# MySQL configuration
[MYSQL]
# Host
HOST = 
# Port
PORT = 
# Username
USER = 
# Password
PASSWORD = 
# Database name
BD_NAME = 
# Character encoding
CHARSET = 
# Sqlite3 configuration
[SQLITE3]
# Database name
BD_NAME = 
# Character encoding
CHARSET = 
"""
        FileUtil.writ_file(conf, content)
def get_statistic_row(self, _rows, _pc, _description, _batch_no, _curr_path,
                      _root_path, _level, logger):
    """
    Recursively collect statistics for a directory and its subdirectories
    @author [email protected]
    :param _rows:
    :param _pc: pc mark
    :param _description: root dir mark
    :param _batch_no:
    :param _curr_path:
    :param _root_path:
    :param _level:
    :param logger:
    :return:
    """
    _logger = logger.getChild('get_statistic_row')
    file_num = 0
    dir_num = 0
    file_size = 0
    zero_num = 0
    _file_num = 0   # number of files
    _dir_num = 0    # number of directories
    _file_size = 0  # total file size
    _zero_num = 0   # number of empty files
    _row = []
    _logger.debug("start - %s" % _curr_path)
    for loop_dir_path, loop_dir_names, loop_file_names in os.walk(
            _curr_path, followlinks=False):
        # _file_num = loop_file_names.__len__()  # would also count shortcuts
        # _dir_num = loop_dir_names.__len__()    # would also count shortcuts
        if len(loop_file_names) > 0:
            for child_file_name in loop_file_names:
                temp_path = os.path.join(loop_dir_path, child_file_name)
                ext = os.path.splitext(temp_path)[1][1:].lower()
                if ext != 'lnk' and os.path.isfile(temp_path):
                    _file_num += 1
                    _temp_size = os.path.getsize(temp_path)
                    _file_size += _temp_size
                    if _temp_size == 0:
                        _zero_num += 1
        if len(loop_dir_names) > 0:
            for child_dir_name in loop_dir_names:
                temp_path = os.path.join(loop_dir_path, child_dir_name)
                real_path = os.path.realpath(temp_path)
                if temp_path == real_path:
                    arr = self.get_statistic_row(_rows, _pc, _description,
                                                 _batch_no, temp_path,
                                                 _root_path, _level + 1,
                                                 logger)
                    file_num += arr[0]
                    dir_num += arr[1]
                    file_size += arr[2]
                    zero_num += arr[3]
                    _dir_num += 1
        file_num += _file_num
        dir_num += _dir_num
        file_size += _file_size
        zero_num += _zero_num
        break
    if _level <= self.max_level:
        _dir_name = os.path.basename(_curr_path)
        _dir_path = os.path.abspath(_curr_path)
        _parent_path = os.path.abspath(
            os.path.dirname(_curr_path) + os.path.sep + ".")
        _row.append(_pc)
        _row.append(_description)
        _row.append(_batch_no)
        _row.append(_dir_name)
        _row.append(_dir_path)
        _row.append(_parent_path)
        _row.append(_root_path)
        _row.append(_level)
        _row.append(_file_num)
        _row.append(file_num)
        _row.append(_dir_num)
        _row.append(dir_num)
        _row.append(FileUtil.format2MB(_file_size))
        _row.append(FileUtil.format2MB(file_size))
        _row.append(_zero_num)
        _row.append(zero_num)
        _row.append((datetime.now()).strftime('%Y-%m-%d %H:%M:%S'))
        _rows.append(_row)
        self.total_num += 1
        if _rows.__len__() == 500:
            temp_rows = _rows.copy()
            CsvUtil.write(self.work_csv, [], temp_rows)
            self.logger.info("csv export 500.")
            _rows.clear()
    arr = [file_num, dir_num, file_size, zero_num]
    _logger.debug("end - %s ,%s" % (_curr_path, _row))
    self.trace_num += 1
    if self.trace_num % 10000 == 0:
        _logger.info("current trace num: %d %s" % (self.trace_num, _curr_path))
    return arr
    # Nested handlers and mainloop; this appears to be the tail of the
    # show_drawable_canvas() function called in the __main__ block below.
    def clear(event):
        nonlocal draw_data
        event.widget.delete("all")
        draw_data = [[0] * 28 for _ in range(28)]

    def done(_):
        master.destroy()

    master = tk.Tk()
    canvas = tk.Canvas(master, width=560, height=560)
    canvas.pack()
    canvas.bind('<ButtonPress-1>', draw)
    canvas.bind('<B1-Motion>', draw)
    canvas.bind('<Double-1>', clear)
    canvas.bind('<ButtonPress-2>', done)
    master.mainloop()
    return np.array([a for b in draw_data for a in b])


if __name__ == "__main__":
    dataset_index = -3
    data = fUtil.load_data(fUtil.TRAINING_DATA_NAMES[dataset_index])
    name = (fUtil.TRAINING_DATA_NAMES[dataset_index][:1].upper() +
            fUtil.TRAINING_DATA_NAMES[dataset_index][1:-4])
    show_image(data[0])
    show_image(data, bulk_size=49, name=name)
    data = show_drawable_canvas()
    print(data)
    show_image(data)
# Option: save training array to cache
SAVE_TO_CACHE = False
# Option: save trained neural network data
SAVE_NEURAL_NETWORK = False
# Option: skip the training of the neural network (debugging)
SKIP_TRAINING = True

# Code starts
if not LOAD_FROM_CACHE:
    # Load training / testing data
    print("Loading training and testing data...")
    training, testing = [], []  # Array of tuples: (image, answer)
    category_count = len(fUtil.TRAINING_DATA_NAMES)
    for category in fUtil.TRAINING_DATA_NAMES:
        # Print the progress
        print(">> Loading " + fUtil.get_name(fUtil.get_index(category)) +
              "... (" + str(fUtil.get_index(category) + 1) + "/" +
              str(category_count) + ")")
        # Normalize the data for more efficiency
        data = fUtil.load_data(category, normalize=True)
        # Split the data into training data and testing data
        train_limit = int(len(data) * TRAIN_TEST_RATIO)
        index = fUtil.get_index(category)
        # Append the current data to the master data lists
        training += [(image_data,
                      [1 if a == index else 0 for a in range(category_count)])
                     for image_data in data[:train_limit]]
        testing += [(image_data,
                     [1 if a == index else 0 for a in range(category_count)])
                    for image_data in data[train_limit:]]
def testWrite(self):
    """
    :return:
    """
    fileName = os.path.join(os.getcwd(), "content.txt")
    exist = FileUtil.existFile(fileName)
    if exist:
        FileUtil.delFile(fileName)
    FileUtil.writeContent(fileName, "HelloWorld")
    FileUtil.writeContent(fileName, "HelloWorld")
    content = FileUtil.readContent(fileName)
    self.assertEqual(content, "HelloWorldHelloWorld")
    FileUtil.delFile(fileName)
    self.assertFalse(FileUtil.existFile(fileName))
def save(self, doc: Doc, objects: List, file_name: str, file_subdirectory: str):
    output_directory = self.get_output_directory(file_subdirectory)
    FileUtil.create_directory_if_not_exists(output_directory)
    doc, objects_column_names = self.prepare_doc_for_saving(doc, objects)
    doc.to_disk(f'{output_directory}/{file_name}')
    PandasUtil.write_object_list_as_dataframe_file(doc._.entities,
                                                   file_name,
                                                   f'{output_directory}/as_df',
                                                   columns=objects_column_names)