def analyse_cvjson(self, element: Etype.CvJson):
    """Count, per label, the frames of `element` that score above the threshold.

    The element must contain exactly one ``.json`` file. That file is read
    and its ``"labels"`` mapping is walked; every label maps to a dict with
    parallel ``"frames"`` and ``"scores"`` sequences. A label's *rank* is the
    number of frame positions whose score is greater than ``self.thresh``.
    Ranks greater than 4 are logged and stored in
    ``self.ranking_data[label][element.id]``.

    Args:
        element: the element to analyse; its ``paths`` and ``id`` are read.

    Returns:
        None. Rankings are only accumulated on ``self.ranking_data``.

    Raises:
        ElementShouldSkipError: if the element does not hold exactly one
            ``.json`` file, or if anything fails while indexing the labels.
    """
    # Compare the suffix for equality. The previous `f.suffix in ".json"` was
    # a substring test, so extension-less files ("" is a substring of every
    # string) and suffixes such as ".js" were counted as JSON files too.
    jsons = [f for f in element.paths if f.suffix == ".json"]
    if len(jsons) != 1:
        raise ElementShouldSkipError(f"Not exactly one json in {element.id}")

    with open(jsons[0], "r") as f:
        data = json.load(f)

    try:
        # TODO: this logic should be a custom etype built from a core etype class...
        # the core class can then include associated methods.
        for label, preds in data["labels"].items():
            frames, scores = preds["frames"], preds["scores"]
            # Index `scores` by frame position (rather than zipping) so that a
            # `scores` list shorter than `frames` still surfaces as an error.
            rank = sum(
                1 for idx, _ in enumerate(frames) if scores[idx] > self.thresh
            )
            if rank > 4:
                self.logger(f"label '{label}': rank {rank}")
                # Gather ranks in `ranking_data`.
                # NOTE(review): recording is assumed to sit inside the
                # `rank > 4` guard together with the log line - confirm.
                self.ranking_data.setdefault(label, {})[element.id] = rank

        self.logger(f"Rankings indexed for {element.id}.")
        return None
    except Exception as e:
        # Any failure while indexing means this element cannot be ranked;
        # chain the cause so the original traceback is preserved.
        raise ElementShouldSkipError(str(e)) from e
def _youtube_search(self, options, pageToken=None):
    """Execute a single YouTube video search request and return its response.

    Args:
        options: mapping holding the search term under ``"q"`` and,
            optionally, ``"before"`` / ``"after"`` values that are forwarded
            as ``publishedBefore`` / ``publishedAfter``.
        pageToken: token forwarded as the request's ``pageToken``; defaults
            to None.

    Returns:
        Whatever ``execute()`` returns for the built search request.

    Raises:
        ElementShouldSkipError: if the module-level ``API_KEY`` is None.
    """
    # modified from https://github.com/youtube/api-samples/blob/master/python/search.py
    if API_KEY is None:
        raise ElementShouldSkipError("No GOOGLE_API_KEY specified in .env")

    client = googleapiclient.discovery.build(
        YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY
    )

    search_kwargs = dict(
        pageToken=pageToken,
        q=options["q"],
        part="id,snippet",
        maxResults=50,
        safeSearch="none",
        type="video",
    )

    # Optional date bounds: option key -> name of the API parameter.
    date_bounds = (("before", "publishedBefore"), ("after", "publishedAfter"))
    for option_key, api_param in date_bounds:
        if option_key in options:
            search_kwargs[api_param] = options[option_key]

    return client.search().list(**search_kwargs).execute()
def retrieve_element(self, element, config) -> LocalElement:
    """Raise a skip or retry error chosen by the element's id, else return None.

    * ``"skip"``   -> always raises ElementShouldSkipError.
    * ``"retry3"`` -> raises ElementShouldRetryError while
      ``self.retryCount`` is below 3, incrementing the counter each time.
    * ``"retryN"`` -> always raises ElementShouldRetryError.
    * any other id -> returns None.

    ``config`` is not used.
    """
    ident = element.id

    if ident == "skip":
        raise ElementShouldSkipError("test")

    if ident == "retry3" and self.retryCount < 3:
        # Count the attempt before signalling the retry.
        self.retryCount += 1
        raise ElementShouldRetryError("test")

    if ident == "retryN":
        raise ElementShouldRetryError("test")

    return None
def analyse_element(self, element, config):
    """Raise a skip or retry error chosen by the element's id, else do nothing.

    * ``"skip"``   -> always raises ElementShouldSkipError.
    * ``"retry3"`` -> raises ElementShouldRetryError while
      ``self.retryCount`` is below 3, incrementing the counter each time.
    * ``"retryN"`` -> always raises ElementShouldRetryError.
    * any other id -> falls through and returns None implicitly.

    ``config`` is not used.
    """
    ident = element.id

    if ident == "skip":
        raise ElementShouldSkipError("test")

    if ident == "retry3" and self.retryCount < 3:
        # Count the attempt before signalling the retry.
        self.retryCount += 1
        raise ElementShouldRetryError("test")

    if ident == "retryN":
        raise ElementShouldRetryError("test")
# NOTE(review): `Union(...)` in the return annotation is called, not
# subscripted. That only works if `Union` is a project-defined callable
# rather than `typing.Union` - confirm against the file's imports.
def retrieve_element(self, element, _) -> Union(Etype.Video, Etype.Json):
    """Extract an element via ``self.ydl`` and store its metadata as JSON.

    The value returned by ``self.ydl.extract_info(element.url)`` is dumped to
    ``TMP / element.id / "meta.json"``. Afterwards
    ``self.disk.delete_local_on_write`` is set to True and the files found
    under ``TMP / element.id`` are cast into an etype.
    Presumably the extractor is configured to download the video into that
    same directory - verify against where ``self.ydl`` is constructed.

    Args:
        element: the element to retrieve; its ``url`` and ``id`` are read.
        _: unused.

    Returns:
        The result of ``Etype.cast`` over the element's files in ``TMP``.

    Raises:
        ElementShouldSkipError: if a ``youtube_dl.utils.DownloadError`` is
            raised inside the block.
    """
    with self.ydl:
        try:
            info = self.ydl.extract_info(element.url)

            element_dir = TMP / element.id
            meta_path = element_dir / "meta.json"
            with open(meta_path, "w+") as meta_file:
                json.dump(info, meta_file)

            self.logger(f"{element.id}: video and meta downloaded successfully.")
            self.disk.delete_local_on_write = True

            local_files = files(element_dir)
            return Etype.cast(element.id, local_files)
        except youtube_dl.utils.DownloadError:
            raise ElementShouldSkipError(
                f"Something went wrong downloading {element.id}. It may have been deleted."
            )