def makeDividedSets():
    import os
    from math import ceil
    from utils import readLines, writeLines
    from verifier import splitFullCategory

    ctgInPart = 35  # categories per part

    categories = readLines(Path.categories)
    divisions = ceil(len(categories) / ctgInPart)
    categories = [categories[i * ctgInPart:(i + 1) * ctgInPart] for i in range(divisions)]

    for i, ctgList in enumerate(categories):
        pathsList = []
        for ctg in ctgList:
            category, subcategory = splitFullCategory(ctg)
            originalPath = os.path.join(Path.dataset, Constants.original, category, subcategory)
            augmentedPath = os.path.join(Path.dataset, Constants.augmented, category, subcategory)
            pathsList.extend([originalPath, augmentedPath])

        setPath = os.path.join(Path.sets, f"part_{i}")
        makeSets(pathsList, wpath=setPath, trainPart=0.9, validPart=0.05)
        writeLines(ctgList, os.path.join(setPath, "set_categories.txt"))
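# A minimal, dependency-free sketch of the chunking step used above
# (hypothetical helper, shown for illustration only): ceil(len(items) / size)
# slices of at most `size` items each, e.g.
# _chunk(list(range(7)), 3) -> [[0, 1, 2], [3, 4, 5], [6]].
def _chunk(items, size):
    from math import ceil
    return [items[i * size:(i + 1) * size] for i in range(ceil(len(items) / size))]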
def processVideoFolder(folderPath=Path.rawVideos, marksPath=Path.rawJson, datasetPath=Path.dataset,
                       overwrite=False, extension=Extensions.jpg, params=None):
    processedVideos = readLines(Path.processedFiles)

    # skip already processed files; accept only .mov / .mp4
    videos = [
        video for video in os.listdir(folderPath)
        if video not in processedVideos
        and (video.endswith(Extensions.mov) or video.endswith(Extensions.mp4))
    ]

    actualInfo = downloadActualInfo()

    for video in videos:
        actualizeInfoWithFrames(Path.dataset)

        filePath = os.path.join(folderPath, video)
        print(f"\n{Fore.GREEN}Video {filePath} is being processed {Style.RESET_ALL}")

        frameVideo(filePath=filePath, marksPath=marksPath, datasetPath=datasetPath,
                   actualInfo=actualInfo, overwrite=overwrite, extension=extension, params=params)

        processedVideos.append(video)

    writeLines(set(processedVideos), Path.processedFiles)
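# Hedged usage sketch for processVideoFolder (assumes the Path constants point
# at existing folders; cv2.IMWRITE_JPEG_QUALITY is a real OpenCV flag that is
# passed through `params` to cv2.imwrite):
def _demoProcessVideos():
    processVideoFolder(overwrite=False,
                       extension=Extensions.jpg,
                       params=[cv2.IMWRITE_JPEG_QUALITY, 95])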
def purifySets():
    sets = {
        const.train: os.path.join(Path.sets, extendName(const.train, Extensions.txt)),
        const.valid: os.path.join(Path.sets, extendName(const.valid, Extensions.txt)),
        const.test: os.path.join(Path.sets, extendName(const.test, Extensions.txt))
    }

    for set_, path in sets.items():
        files = readLines(path)
        total = len(files)

        # keep only the paths that still exist on disk
        files = [f for f in files if os.path.exists(f)]
        writeLines(files, path)

        print(f"Cleaned {total - len(files)} entries from {path}")
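# A dependency-free sketch of the same pruning idea (hypothetical helper, not
# part of the project): read a list file, keep only paths that still exist,
# rewrite it, and report how many entries were dropped.
def _pruneMissing(listPath):
    import os
    with open(listPath) as f:
        entries = [line.strip() for line in f if line.strip()]
    kept = [e for e in entries if os.path.exists(e)]
    with open(listPath, "w") as f:
        f.write("\n".join(kept))
    return len(entries) - len(kept)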
def makeCategoriesList(summarizedPath=Path.summarizedRaw, allowedSubCtgList=None):
    from utils import openJsonSafely, writeLines
    from verifier import getFullCategory

    summarized = openJsonSafely(summarizedPath)

    ctgList = []
    for ctg, value in summarized.items():
        if ctg == const.maxIdx:
            continue

        for subctg in value:
            if allowedSubCtgList is not None and subctg not in allowedSubCtgList:
                continue

            idx = value[subctg][const.ctgIdx]
            ctgList.append((getFullCategory(ctg, subctg), idx))

    # order categories by their stored index, then drop the index
    ctgList = [ctg for ctg, _ in sorted(ctgList, key=lambda x: x[1])]
    writeLines(ctgList, Path.categories)
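# Pure-Python sketch of the ordering step above (made-up sample data): pair
# each full category with its stored index, sort by index, keep the names.
def _demoCategoryOrdering():
    pairs = [("animal_cat", 2), ("animal_dog", 0), ("vehicle_car", 1)]
    # -> ["animal_dog", "vehicle_car", "animal_cat"]
    return [name for name, _ in sorted(pairs, key=lambda p: p[1])]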
def makeSets(directories, wpath=Path.sets, trainPart=0.9, validPart=0.05,
             ignoreOld=False, matchWithMarks=True):
    assert 0 < trainPart + validPart <= 1

    os.makedirs(wpath, exist_ok=True)
    testPart = 1 - trainPart - validPart

    sets = {
        const.train: {
            "path": os.path.join(wpath, extendName(const.train, Extensions.txt)),
            "part": trainPart,
            "content": []
        },
        const.valid: {
            "path": os.path.join(wpath, extendName(const.valid, Extensions.txt)),
            "part": validPart,
            "content": []
        },
        const.test: {
            "path": os.path.join(wpath, extendName(const.test, Extensions.txt)),
            "part": testPart,
            "content": []
        }
    }

    inUse = []
    for set_, info in sets.items():
        info["content"] = readLines(info["path"]) if not ignoreOld else []
        inUse.extend(info["content"])

    images = []
    marks = []
    for dirIdx, path in enumerate(directories):
        print("\rSearching for images and marks in listed directories, {:.1f}% has been done".format(
            dirIdx / len(directories) * 100), end="")

        dirImages = [os.path.join(path, *img) for img in
                     walk(path, targetExtensions=Extensions.images()).get("extensions")]
        images.extend(dirImages)

        if matchWithMarks:
            dirMarks = [os.path.join(path, *mrk) for mrk in
                        walk(path, targetExtensions=Extensions.txt).get("extensions")]
            marks.extend(dirMarks)

    if matchWithMarks:
        transformer = lambda x: changeExtension(x, Extensions.txt)
        print("Matching images to marks, please wait...")
        images = matchLists(master=marks, slave=images, transformer=transformer)

    # _, images = matchLists(master=inUse, slave=images, getMismatched=True)
    images = permutate(images)

    start = 0
    for set_, info in sets.items():
        part = info["part"]
        end = start + int(part * len(images))
        total = end - start

        info["content"].extend(images[start:end])
        info["content"] = permutate(info["content"])
        start = end

        writeLines(lines=info["content"], path=info["path"])
        print(f"\n{Fore.GREEN}Added {total} paths to {set_} {Style.RESET_ALL}")
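# Hedged sketch of the split arithmetic makeSets relies on (standard library
# only; unlike makeSets, this version hands any rounding remainder to test):
def _splitDemo(items, trainPart=0.9, validPart=0.05):
    import random
    items = list(items)
    random.shuffle(items)
    nTrain = int(trainPart * len(items))
    nValid = int(validPart * len(items))
    return items[:nTrain], items[nTrain:nTrain + nValid], items[nTrain + nValid:]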
def frameVideo(filePath, marksPath, datasetPath, actualInfo, overwrite=False,
               extension=Extensions.jpg, params=None, ctgLimit=None):
    categories = readLines(Path.categories)
    basename = extractBasename(filePath)
    jsonName = makeJSONname(basename)

    try:
        with open(os.path.join(marksPath, jsonName), "r") as jsonFile:
            marks = json.load(jsonFile)
    except (OSError, json.JSONDecodeError):
        print(f"{Fore.RED}There is no json file {jsonName} in {marksPath} for {filePath} {Style.RESET_ALL}")
        return

    framesGenerator = generateFrames(filePath)
    offset = getKeysOffset(marks.keys())
    marksSeparated = {}
    total = 0
    globalIdx = None

    for idx, frame in enumerate(framesGenerator):
        # if idx == 20:
        #     break

        frameMarks = getFrameMarks(idx, marks, offset)
        if not frameMarks:
            continue

        category = frameMarks[const.category]
        subcategory = frameMarks[const.subcategory]
        countKeys = [const.original, category, subcategory]

        # read the existing frame count once, on the first annotated frame
        # (the first frame of the video is not guaranteed to carry marks)
        if globalIdx is None:
            globalIdx = getNested(dictionary=actualInfo, keys=countKeys, default=0)

        localIdx = idx + globalIdx
        if ctgLimit is not None and localIdx == ctgLimit:
            break

        frameID = f"frame_{localIdx}"
        fullCategory = getFullCategory(category, subcategory)

        if fullCategory not in categories:
            categories.append(fullCategory)

        ctgIdx = categories.index(fullCategory)
        frameName = f"{fullCategory}{const.separator}{frameID}{const.separator}{const.original}"

        dirPath = os.path.join(datasetPath, const.original, category, subcategory)
        framesPath = os.path.join(dirPath, const.frames)
        framePath = os.path.join(framesPath, extendName(frameName, extension))

        updateNested(dictionary=actualInfo, keys=countKeys, value=1)

        if not overwrite and os.path.exists(framePath):
            print("\rFrame #{} has been passed".format(idx), end="")
            continue

        os.makedirs(framesPath, exist_ok=True)

        frameInfo = {
            const.image: extendName(frameName, extension),
            const.coords: fitCoords(frameMarks[const.coords], frame.shape[:2]),
            const.fullCategory: fullCategory,
            const.ctgIdx: ctgIdx,
            const.imageShape: frame.shape[:2]
        }

        keySet = countKeys + [frameName]  # ["original", category, subcategory, frameName]
        putNested(dictionary=marksSeparated, keys=keySet, value=frameInfo)

        cv2.imwrite(framePath, frame, params)
        total += 1
        print("\rFrame #{} has been added".format(idx), end="")

    marksSeparated = marksSeparated.get(const.original, {})  # empty if no frames were added
    print()

    for ctg, value in marksSeparated.items():
        for subctg, subctgMarks in value.items():
            subctgMarksJson = os.path.join(datasetPath, const.original, ctg, subctg,
                                           extendName(const.marks, Extensions.json))

            oldMarks = openJsonSafely(subctgMarksJson)
            for k, v in subctgMarks.items():
                oldMarks[k] = v

            with open(subctgMarksJson, "w") as marksFile:
                json.dump(oldMarks, marksFile, indent=3)

            print(f"{Fore.GREEN}Added marks to {subctgMarksJson} {Style.RESET_ALL}")

    writeLines(categories, Path.categories)
    print(f"{Fore.GREEN}Updated categories file {Path.categories} {Style.RESET_ALL}")
    print(f"{Fore.GREEN}Added {total} frames in total {Style.RESET_ALL}")
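# Minimal OpenCV-only sketch of a frame generator like the generateFrames
# helper assumed above (cv2.VideoCapture / read / release are the actual
# OpenCV calls; yields BGR frames until the stream ends):
def _frames(videoPath):
    cap = cv2.VideoCapture(videoPath)
    try:
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            yield frame
    finally:
        cap.release()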
# Build the count-matrix cache per database (the dict-comprehension form is
# kept below for reference).
for db in cmdbs:
    try:
        cms[db] = cm2(db)
    except Exception:
        print('OOPS!')
        continue
# cms = {db: cm2(db) for db in cmdbs}

for iii in range(len(clusters)):
    # df = clust_dfs.ix[iii]  # not needed for this step
    if len(clusters[iii]) < 10:
        break

    out = get_motif_cluster_sites(iii, force=False)
    if out['record'] is None:
        continue

    try:
        pdf_data = plot_motif_from_sites(out['sites'], 'pdf', out['smallText'])
        ut.writeLines(pdf_data.split('\n'),
                      'motif_clusters_%s/%04d.pdf' % (param_I_str, iii))
        ut.writeLines(out['memeOut'].split('\n'),
                      'motif_clusters_%s/%04d_memeOut.txt' % (param_I_str, iii))
    except Exception:
        print('ERROR in generating PDF for motif cluster.')

    print(iii, 'DONE')

# os.popen('pdftk motif_clusters_%s/????.pdf cat output motif_clusters_%s/ALL.pdf' % (param_I_str, param_I_str)).read()
try:
    os.popen(('gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=motif_clusters_%s/ALL.pdf '
              'motif_clusters_%s/????.pdf') % (param_I_str, param_I_str)).read()
    os.popen('pdfnup --landscape --suffix nup --nup 5x6 motif_clusters_%s/ALL.pdf' % param_I_str).read()
except Exception:
    print('gs or pdfnup not available!')

# widths = np.array([pssm.shape[0] for pssm in all_pssms.values()])
# max_width = np.max(widths)
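# Hedged alternative to the os.popen calls above (hypothetical helper): the
# same Ghostscript merge via subprocess with explicit globbing, since shell
# wildcards are not expanded when arguments are passed as a list.
def _mergePdfs(pattern, outPath):
    import glob
    import subprocess
    try:
        subprocess.run(['gs', '-dBATCH', '-dNOPAUSE', '-q', '-sDEVICE=pdfwrite',
                        '-sOutputFile=%s' % outPath] + sorted(glob.glob(pattern)),
                       check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        print('gs not available or merge failed!')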