def analyse_element(
    self, element: Union(Array(Etype.Image), Etype.Json), _
) -> Etype.Json:
    self.logger(f"Running inference on frames in {element.id}...")
    val = Etype.CvJson.from_preds(element, self.get_preds)
    self.logger(f"Wrote predictions JSON for {element.id}.")
    self.disk.delete_local_on_write = True
    return val
def test_cast(base):
    # explicit cast
    with pytest.raises(EtypeCastError):
        cast(base.id, [], Etype.Image)
    with pytest.raises(EtypeCastError):
        cast(base.id, [base.txt1], Etype.Image)
    t1 = cast(base.id, [base.im1], to=Etype.Image)
    assert len(t1.paths) == 1
    assert t1.et == Etype.Image

    # implicit cast
    with pytest.raises(EtypeCastError):
        cast(base.id, [])
    i1 = cast(base.id, [base.im1])
    assert len(i1.paths) == 1
    assert i1.et == Etype.Image
    i2 = cast(base.id, [base.im2])
    assert len(i2.paths) == 1
    assert i2.et == Etype.Image
    ia1 = cast(base.id, [base.im1, base.im2])
    assert len(ia1.paths) == 2
    assert ia1.et == Array(Etype.Image)
    a1 = cast(base.id, base.aud1)
    assert len(a1.paths) == 1
    assert a1.et == Etype.Audio

    # unions
    ai1 = cast(base.id, [base.im3, base.aud1])
    assert len(ai1.paths) == 2
    assert ai1.et == Union(Etype.Image, Etype.Audio)
    ai2 = cast(base.id, [base.aud1, base.im2])
    assert len(ai2.paths) == 2
    assert ai2.et == Union(Etype.Image, Etype.Audio)
    iaa1 = cast(base.id, [base.im1, base.im2, base.aud1])
    assert len(iaa1.paths) == 3
    assert iaa1.et == Union(Array(Etype.Image), Etype.Audio)
    any1 = cast(base.id, [base.im1, base.im2, base.aud1, base.txt1])
    assert len(any1.paths) == 4
    assert any1.et == Etype.Any
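# These tests assume a pytest `base` fixture exposing an element id and
# sample media paths. The project's real conftest isn't shown here; the
# following is a minimal sketch of what such a fixture might look like,
# with all names and file suffixes assumed for illustration.
import pytest
from types import SimpleNamespace

@pytest.fixture
def base(tmp_path):
    def touch(name):
        # create an empty file with the suffix the casting logic dispatches on
        p = tmp_path / name
        p.touch()
        return p

    return SimpleNamespace(
        id="element-1",
        im1=touch("a.jpg"),
        im2=touch("b.png"),
        im3=touch("c.jpg"),
        aud1=touch("d.mp3"),
        txt1=touch("e.txt"),
        md1=touch("f.md"),
    )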
class KerasPretrained(Analyser):
    in_etype = Union(Array(Etype.Image), Etype.Json)
    out_etype = Etype.Json

    def pre_analyse(self, config):
        self.logger(config["model"])
        self.logger(f"Storing models in {KERAS_HOME}")
        MOD = SUPPORTED_MODELS.get(config["model"])
        if MOD is None:
            raise InvalidAnalyserConfigError(
                f"The module '{config['model']}' either does not exist, or is not yet supported."
            )
        rLabels = config["labels"]
        # TODO: make it so that this doesn't redownload every run,
        # i.e. refactor it into partial.Dockerfile
        self.model_module = import_module(f"keras.applications.{MOD['module']}")
        impmodel = getattr(self.model_module, config["model"])
        # NB: this downloads the weights if they don't exist
        self.model = impmodel(weights="imagenet")
        self.THRESH = 0.1

        # revert to serial if CPU (TODO: debug why parallel CPU doesn't work)
        if not tf.test.is_gpu_available():
            self.in_parallel = False

        def get_preds(img_path):
            img = load_img(img_path, target_size=(224, 224))
            x = img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = self.model_module.preprocess_input(x)
            preds = self.model.predict(x)
            # the `top` kwarg must be passed or it defaults to 5; a larger
            # number ensures all relevant labels are decoded
            decoded = self.model_module.decode_predictions(preds, top=10)
            # keep only predictions whose label is in the whitelist
            filteredPreds = [p for p in decoded[0] if p[1] in rLabels]
            return [
                (x[1], float(x[2]))
                for x in filteredPreds
                if float(x[2]) >= self.THRESH
            ]

        self.get_preds = get_preds

    def analyse_element(self, element, _):
        self.logger(f"Running inference on frames in {element.id}...")
        val = Etype.CvJson.from_preds(element, self.get_preds)
        self.logger(f"Wrote predictions JSON for {element.id}.")
        self.disk.delete_local_on_write = True
        return val

    def post_analyse(self, elements) -> Etype.Json:
        return rank(elements, logger=self.logger)
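# For reference, a hypothetical config that would satisfy pre_analyse
# above. "ResNet50" is an assumption: any key of SUPPORTED_MODELS would
# do, and "labels" whitelists the ImageNet class names to keep.
config = {
    "model": "ResNet50",
    "labels": ["tabby", "jeep", "tank"],
}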
def retrieve_element(self, element, _) -> Union(Etype.Video, Etype.Json):
    with self.ydl:
        try:
            result = self.ydl.extract_info(element.url)
            meta = TMP / element.id / "meta.json"
            with open(meta, "w+") as fp:
                json.dump(result, fp)
            self.logger(f"{element.id}: video and meta downloaded successfully.")
            self.disk.delete_local_on_write = True
            return Etype.cast(element.id, files(TMP / element.id))
        except youtube_dl.utils.DownloadError:
            raise ElementShouldSkipError(
                f"Something went wrong downloading {element.id}. It may have been deleted."
            )
def test_Union(base):
    ImAud = Union(Etype.Image, Etype.Audio)
    with pytest.raises(EtypeCastError):
        ImAud(base.id, [])
    with pytest.raises(EtypeCastError):
        ImAud(base.id, base.txt1)
    with pytest.raises(EtypeCastError):
        ImAud(base.id, base.im1)
    with pytest.raises(EtypeCastError):
        ImAud(base.id, base.aud1)
    has2 = ImAud(base.id, [base.aud1, base.im1])
    assert len(has2.paths) == 2
    f2 = ImAud(base.id, [base.im3, base.md1, base.aud1])
    assert len(f2.paths) == 2
    assert base.im3 in f2.paths
    assert base.aud1 in f2.paths
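# The last four assertions imply that a Union keeps only paths matching
# one of its member types, and raises unless every member is represented.
# A self-contained illustration of that behaviour (not mtriage's actual
# implementation), dispatching on file suffix:
from pathlib import Path

IMAGE_SUFFIXES = {".jpg", ".png"}
AUDIO_SUFFIXES = {".mp3", ".wav"}

def union_filter(paths, members=(IMAGE_SUFFIXES, AUDIO_SUFFIXES)):
    # drop paths that match none of the member types
    kept = [p for p in paths if any(Path(p).suffix in m for m in members)]
    # require at least one path per member type
    for m in members:
        if not any(Path(p).suffix in m for p in kept):
            raise ValueError("union requires at least one path per member type")
    return kept

assert union_filter(["c.jpg", "f.md", "d.mp3"]) == ["c.jpg", "d.mp3"]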
def analyse_element(
    self, element: Union(Etype.Json, Etype.Video), config
) -> GLOSSED_FRAMES:
    fps = int(config["fps"]) if "fps" in config else 1
    jsons = [x for x in element.paths if x.suffix == ".json"]
    dest = Path("/tmp") / element.id
    if dest.exists():
        rmtree(dest)
    dest.mkdir()
    if len(jsons) == 1:
        meta_json = jsons[0]
        copyfile(meta_json, dest / "meta.json")
    video = [x for x in element.paths if x.suffix in VID_SUFFIXES][0]
    ffmpeg_frames(dest, video, fps)
    self.logger(f"Frames successfully created for element {element.id}.")
    self.disk.delete_local_on_write = True
    return GLOSSED_FRAMES(element.id, paths=files(dest))
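# ffmpeg_frames is not defined in this snippet. A minimal sketch of such
# a helper, assuming it shells out to ffmpeg to dump frames at the given
# rate (the output filename pattern is an assumption):
import subprocess
from pathlib import Path

def ffmpeg_frames(dest: Path, video: Path, fps: int) -> None:
    # -vf fps=N samples N frames per second into zero-padded JPEGs
    subprocess.run(
        ["ffmpeg", "-i", str(video), "-vf", f"fps={fps}", str(dest / "%04d.jpg")],
        check=True,
    )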
class Youtube(Selector):
    out_etype = Union(Etype.Json, Etype.Video)

    def index(self, _) -> LocalElementsIndex:
        results = self._run()
        if len(results) > 0:
            out = []
            out.append(list(results[0].keys()))
            out.extend([x.values() for x in results])
            return LocalElementsIndex(out)
        return None

    def pre_retrieve(self, _):
        self.ydl = youtube_dl.YoutubeDL(
            {
                "outtmpl": f"{TMP}/%(id)s/%(id)s.mp4",
                "format": "worstvideo[ext=mp4]",
            }
        )

    def retrieve_element(self, element, _):
        with self.ydl:
            try:
                result = self.ydl.extract_info(element.url)
                meta = TMP / element.id / "meta.json"
                with open(meta, "w+") as fp:
                    json.dump(result, fp)
                self.logger(f"{element.id}: video and meta downloaded successfully.")
                self.disk.delete_local_on_write = True
                return Etype.cast(element.id, files(TMP / element.id))
            except youtube_dl.utils.DownloadError:
                raise ElementShouldSkipError(
                    f"Something went wrong downloading {element.id}. It may have been deleted."
                )

    def _run(self):
        self.logger(f"Query: {self.config['search_term']}")
        if "uploaded_after" in self.config:
            self.logger(f"Start: {self.config['uploaded_after']}")
        if "uploaded_before" in self.config:
            self.logger(f"End: {self.config['uploaded_before']}")

        if self.config.get("daily"):
            results = []
            self.logger(
                f"Scraping daily, from {self.config['uploaded_after']} -- {self.config['uploaded_before']}"
            )
            self.logger("-----------------")
            for after, before in self._days_between(
                self.config["uploaded_after"], self.config["uploaded_before"]
            ):
                results = results + self.get_results(before, after)
        else:
            results = self.get_results(
                self.config.get("uploaded_before"),
                self.config.get("uploaded_after"),
            )

        self.logger("\n\n----------------")
        self.logger(f"Scrape successful, {len(results) - 1} results.")
        return results

    def get_results(self, before, after):
        args_obj = {"q": self.config["search_term"]}
        # use the per-call window, so that daily scrapes query a single
        # day rather than the full range from the config
        if before is not None:
            args_obj["before"] = before
        if after is not None:
            args_obj["after"] = after
        new_results = self._youtube_search_all_pages(args_obj)
        if new_results is None:
            raise Exception("Something went wrong")
        return new_results

    def _add_to_csv_obj(self, csv_obj, s_res):
        for search_result in s_res:
            videoId = search_result["id"]["videoId"]
            title = search_result["snippet"]["title"]
            channelId = search_result["snippet"]["channelId"]
            desc = search_result["snippet"]["description"]
            publishedAt = search_result["snippet"]["publishedAt"]
            url = f"https://www.youtube.com/watch?v={videoId}"
            id = self._id_from_url(url)
            csv_obj.append(
                {
                    "url": url,
                    "title": title.replace(",", ";"),
                    "desc": desc.replace(",", ";"),
                    "published": publishedAt[0:10],
                    "id": id,
                }
            )
        return csv_obj

    def _youtube_search_all_pages(self, args):
        csv_obj = []
        self.logger(
            f"Search terms: {args['q']}\n Start: {args['after'] if 'after' in args else ''}\n End: {args['before'] if 'before' in args else ''}"
        )
        try:
            s_res = self._youtube_search(args)
            count = 0
            while ("nextPageToken" in s_res) and (len(s_res.get("items", [])) != 0):
                self.logger(f"\tScraping page {count}...")
                count += 1
                csv_obj = self._add_to_csv_obj(csv_obj, s_res.get("items", []))
                s_res = self._youtube_search(args, pageToken=s_res["nextPageToken"])
            # add the last page, which the loop exits before processing
            csv_obj = self._add_to_csv_obj(csv_obj, s_res.get("items", []))
            self.logger("\tAll pages scraped.")
            return csv_obj
        except HttpError as e:
            self.logger(f"An HTTP error {e.resp.status} occurred.")
            print(e.content)
            return None

    def _youtube_search(self, options, pageToken=None):
        # modified from https://github.com/youtube/api-samples/blob/master/python/search.py
        if API_KEY is None:
            raise ElementShouldSkipError("No GOOGLE_API_KEY specified in .env")
        youtube = googleapiclient.discovery.build(
            YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY
        )
        theargs = {
            "pageToken": pageToken,
            "q": options["q"],
            "part": "id,snippet",
            "maxResults": 50,
            "safeSearch": "none",
            "type": "video",
        }
        if "before" in options:
            theargs["publishedBefore"] = options["before"]
        if "after" in options:
            theargs["publishedAfter"] = options["after"]
        request = youtube.search().list(**theargs)
        return request.execute()

    def _days_between(self, start, end):
        bef = datetime.strptime(end[:-1], "%Y-%m-%dT%H:%M:%S")
        aft = datetime.strptime(start[:-1], "%Y-%m-%dT%H:%M:%S")
        between = (bef - aft).days
        return [
            (
                (aft + timedelta(days=dt)).strftime("%Y-%m-%dT") + "00:00:00Z",
                (aft + timedelta(days=dt)).strftime("%Y-%m-%dT") + "23:59:59Z",
            )
            for dt in range(between)
        ]

    def _id_from_url(self, url):
        id_search = re.search(
            r"https:\/\/www\.youtube\.com\/watch\?v\=(.*)", url, re.IGNORECASE
        )
        if id_search:
            return id_search.group(1)
        return None
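# A quick illustration of _days_between's output, assuming `yt` is a
# constructed Youtube selector and using hypothetical dates: each day in
# the window becomes a (start-of-day, end-of-day) pair in RFC 3339 form.
pairs = yt._days_between("2020-01-01T00:00:00Z", "2020-01-03T00:00:00Z")
assert pairs == [
    ("2020-01-01T00:00:00Z", "2020-01-01T23:59:59Z"),
    ("2020-01-02T00:00:00Z", "2020-01-02T23:59:59Z"),
]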