def clear_un_img():
    """Delete images under ``collect/img`` that no collected record references.

    Every directory under ``collect`` (the root included) is read with
    ``collect.read_weibo(..., isreadimg=True)``; the values found in the
    records' ``"img"`` fields form the keep-list. Each entry returned by
    ``FileUtil.listdir`` for the image directory that is not in the keep-list
    is removed with ``os.remove``.

    When ``FileUtil.isempty`` reports the ``collect`` directory as empty, the
    image directory is cleared through ``FileUtil.empty`` instead.
    """
    # Directory where the images are stored.
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    # Records under this directory name the images that must be kept.
    leave_img_url = os.path.join(RESOURCE_BASE_URL, "collect")

    if FileUtil.isempty(leave_img_url):
        FileUtil.empty(all_img_url)
        return

    all_imgs = FileUtil.listdir(all_img_url)

    # The root itself plus every nested sub-directory.
    dirs = [leave_img_url]
    for parent, dirnames, _ in os.walk(leave_img_url):
        dirs.extend(os.path.join(parent, dirname) for dirname in dirnames)

    # A set keeps the membership test below constant-time per image.
    leave_imgs = set()
    for dir_ in dirs:
        records = collect.read_weibo(dir_, isreadimg=True)
        leave_imgs.update(
            flatten([rec.get("img") for rec in records if rec.get("img")]))

    # Delete the images nothing refers to. An explicit loop is used because
    # map() is meant for building values and is lazy on Python 3, where it
    # would silently delete nothing.
    # NOTE(review): entries are matched by exact equality with the recorded
    # "img" values -- confirm both sides use the same path form.
    for img_path in all_imgs:
        if img_path not in leave_imgs:
            os.remove(img_path)
def read_train(self, path):
    """Read training records from the files directly inside *path*.

    *path* is joined onto ``RESOURCE_BASE_URL`` unless it already starts with
    it. Each file listed by ``FileUtil.listdir(path, isrecursion=False)`` is
    handed to ``CommonUtil.read_from_file`` together with a parser, and the
    per-file results are combined with ``flatten``.

    The parser builds one dict per item starting with ``"sentence"``; later
    items starting with ``"img"`` or ``"label"`` fill the ``"img"`` (non-empty
    comma-separated parts) and ``"label"`` keys of the most recent record.
    """
    def handle_read(datas):
        def value_of(item):
            # Text after the first ":" (the whole item when there is none).
            return item[item.find(":") + 1:]

        records = []
        # Placeholder that absorbs "img"/"label" items seen before any
        # "sentence"; it is never added to the result.
        current = {}
        for item in datas:
            if item.startswith("sentence"):
                # A "sentence" item opens a new record.
                current = {"sentence": value_of(item)}
                records.append(current)
            elif item.startswith("img"):
                current["img"] = filter(None, value_of(item).split(","))
            elif item.startswith("label"):
                current["label"] = value_of(item)
        return records

    if not path.startswith(RESOURCE_BASE_URL):
        path = os.path.join(RESOURCE_BASE_URL, path)
    filenames = FileUtil.listdir(path, isrecursion=False)
    per_file = [CommonUtil.read_from_file(name, handle_read)
                for name in filenames]
    return flatten(per_file)
def read_weibo(path, isreadimg=False):
    """Read weibo records from the files directly inside *path*.

    *path* is joined onto ``RESOURCE_BASE_URL`` unless it already starts with
    it. Each file listed by ``FileUtil.listdir(path, isrecursion=False)`` is
    handed to ``read_from_file`` together with a parser, and the per-file
    results are combined with ``flatten``.

    The parser builds one dict per item starting with ``"sentence"``. Items
    starting with ``"img"`` are ignored unless *isreadimg* is true, in which
    case their non-empty comma-separated parts are stored under ``"img"`` on
    the most recent record.
    """
    def handle_read(datas):
        def value_of(item):
            # Text after the first ":" (the whole item when there is none).
            return item[item.find(":") + 1:]

        records = []
        # Placeholder that absorbs "img" items seen before any "sentence";
        # it is never added to the result.
        current = {}
        for item in datas:
            if item.startswith("sentence"):
                # A "sentence" item opens a new record.
                current = {"sentence": value_of(item)}
                records.append(current)
            elif isreadimg and item.startswith("img"):
                current["img"] = filter(None, value_of(item).split(","))
        return records

    if not path.startswith(RESOURCE_BASE_URL):
        path = os.path.join(RESOURCE_BASE_URL, path)
    filenames = FileUtil.listdir(path, isrecursion=False)
    per_file = [read_from_file(name, handle_read) for name in filenames]
    return flatten(per_file)
def read_train(self, path):
    """Read training records from the files directly inside *path*.

    *path* is joined onto ``RESOURCE_BASE_URL`` unless it already starts with
    it. Each file listed by ``FileUtil.listdir(path, isrecursion=False)`` is
    handed to ``CommonUtil.read_from_file`` together with a parser, and the
    per-file results are combined with ``flatten``.

    The parser builds one dict per item starting with ``"sentence"``; later
    items starting with ``"img"`` or ``"label"`` fill the ``"img"`` (non-empty
    comma-separated parts) and ``"label"`` keys of the most recent record.
    """
    def handle_read(datas):
        def value_of(item):
            # Text after the first ":" (the whole item when there is none).
            return item[item.find(":") + 1:]

        records = []
        # Placeholder that absorbs "img"/"label" items seen before any
        # "sentence"; it is never added to the result.
        current = {}
        for item in datas:
            if item.startswith("sentence"):
                # A "sentence" item opens a new record.
                current = {"sentence": value_of(item)}
                records.append(current)
            elif item.startswith("img"):
                current["img"] = filter(None, value_of(item).split(","))
            elif item.startswith("label"):
                current["label"] = value_of(item)
        return records

    if not path.startswith(RESOURCE_BASE_URL):
        path = os.path.join(RESOURCE_BASE_URL, path)
    filenames = FileUtil.listdir(path, isrecursion=False)
    per_file = [CommonUtil.read_from_file(name, handle_read)
                for name in filenames]
    return flatten(per_file)
def read_weibo(path, isreadimg=False):
    """Read weibo records from the files directly inside *path*.

    *path* is joined onto ``RESOURCE_BASE_URL`` unless it already starts with
    it. Each file listed by ``FileUtil.listdir(path, isrecursion=False)`` is
    handed to ``read_from_file`` together with a parser, and the per-file
    results are combined with ``flatten``.

    The parser builds one dict per item starting with ``"sentence"``. Items
    starting with ``"img"`` are ignored unless *isreadimg* is true, in which
    case their non-empty comma-separated parts are stored under ``"img"`` on
    the most recent record.
    """
    def handle_read(datas):
        def value_of(item):
            # Text after the first ":" (the whole item when there is none).
            return item[item.find(":") + 1:]

        records = []
        # Placeholder that absorbs "img" items seen before any "sentence";
        # it is never added to the result.
        current = {}
        for item in datas:
            if item.startswith("sentence"):
                # A "sentence" item opens a new record.
                current = {"sentence": value_of(item)}
                records.append(current)
            elif isreadimg and item.startswith("img"):
                current["img"] = filter(None, value_of(item).split(","))
        return records

    if not path.startswith(RESOURCE_BASE_URL):
        path = os.path.join(RESOURCE_BASE_URL, path)
    filenames = FileUtil.listdir(path, isrecursion=False)
    per_file = [read_from_file(name, handle_read) for name in filenames]
    return flatten(per_file)
def clear_un_img():
    """Delete images under ``collect/img`` that no collected record references.

    Every directory under ``collect`` (the root included) is read with
    ``collect.read_weibo(..., isreadimg=True)``; the values found in the
    records' ``"img"`` fields form the keep-list. Each entry returned by
    ``FileUtil.listdir`` for the image directory that is not in the keep-list
    is removed with ``os.remove``.

    When ``FileUtil.isempty`` reports the ``collect`` directory as empty, the
    image directory is cleared through ``FileUtil.empty`` instead.
    """
    # Directory where the images are stored.
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    # Records under this directory name the images that must be kept.
    leave_img_url = os.path.join(RESOURCE_BASE_URL, "collect")

    if FileUtil.isempty(leave_img_url):
        FileUtil.empty(all_img_url)
        return

    all_imgs = FileUtil.listdir(all_img_url)

    # The root itself plus every nested sub-directory.
    dirs = [leave_img_url]
    for parent, dirnames, _ in os.walk(leave_img_url):
        dirs.extend(os.path.join(parent, dirname) for dirname in dirnames)

    # A set keeps the membership test below constant-time per image.
    leave_imgs = set()
    for dir_ in dirs:
        records = collect.read_weibo(dir_, isreadimg=True)
        leave_imgs.update(
            flatten([rec.get("img") for rec in records if rec.get("img")]))

    # Delete the images nothing refers to. An explicit loop is used because
    # map() is meant for building values and is lazy on Python 3, where it
    # would silently delete nothing.
    # NOTE(review): entries are matched by exact equality with the recorded
    # "img" values -- confirm both sides use the same path form.
    for img_path in all_imgs:
        if img_path not in leave_imgs:
            os.remove(img_path)
def count_img():
    """Print how many entries ``FileUtil.listdir`` reports for ``collect/img``."""
    # Directory where the images are stored.
    img_dir = os.path.join(RESOURCE_BASE_URL, "collect/img")
    img_count = len(FileUtil.listdir(img_dir))
    # Single parenthesised expression: same output from the Python 2 print
    # statement, and also valid as a Python 3 call.
    print("It's have %d images" % img_count)