def retrieve_element(self, element, config):
    """Return an element for retrieval.

    In aggregate mode, bundle every result path under the name of the
    configured source folder; otherwise pass through the element's own path.
    """
    if not self.is_aggregate():
        return Etype.Any(element.id, paths=[element.path])
    source_folder = Path(config["source"])
    result_paths = [pair[1] for pair in self.results]
    return Etype.Any(source_folder.name, paths=result_paths)
def retrieve_element(self, element, _):
    """Persist a tweet element locally: its JSON plus any photos/videos.

    Fix: the original returned early when `download_photos` was enabled but
    the tweet had no photos, which skipped the video-download step entirely
    and never set `disk.delete_local_on_write`. The photo and video steps
    are now independent. (Also: `len(photos) < 1` could never be true after
    `str.split`, which always yields at least one item.)
    """
    base = TMP / element.id
    base.mkdir(parents=True, exist_ok=True)

    with open(base / "tweet.json", "w+") as fp:
        json.dump(element.__dict__, fp)

    # retrieve photos
    downloaded_images = False
    if "download_photos" in self.config and self.config.download_photos:
        # split(",") on an empty string yields [""]; filter that out
        photos = [p for p in element.photos.split(",") if p != ""]
        for url in photos:
            fname = url.rsplit("/", 1)[-1]
            urlretrieve(url, base / fname)
        downloaded_images = bool(photos)

    # retrieve video, if present
    if "download_videos" in self.config and self.config.download_videos:
        if hasattr(element, "video") and element.video != "":
            fname = element.video.rsplit("/", 1)[-1]
            urlretrieve(element.video, base / fname)

    if downloaded_images:
        self.logger(f"{element.id} downloaded (with images).")
    else:
        self.logger(f"{element.id} downloaded.")

    self.disk.delete_local_on_write = True
    return Etype.cast(element.id, files(base))
def rank(elements: List, threshold=0.5, logger=print, element_id="__RANKING") -> Etype:
    """Aggregate per-label rankings across elements and write `all/rankings.json`.

    Each element is expected to carry exactly one JSON file containing
    `labels -> {frames, scores}`; an element's rank for a label is the number
    of frames whose score exceeds `threshold`.

    Fixes:
    - removed a stray `return None` inside the element loop that made the
      function exit after the first element, leaving the aggregation code
      below the loop unreachable.
    - `f.suffix in ".json"` was a substring test (it also matched e.g. ".js"
      and "."); replaced with an equality check.
    """
    ranking_data = {}
    for element in elements:
        jsons = [f for f in element.paths if f.suffix == ".json"]
        if len(jsons) != 1:
            continue
        jsonp = jsons[0]
        with open(jsonp, "r") as jsonf:
            data = json.load(jsonf)
        try:
            # TODO: this logic should be a custom etype built from a core etype class...
            # the core class can then include associated methods.
            labels = data["labels"]
            for label, preds in labels.items():
                frames, scores = preds["frames"], preds["scores"]
                valid_frames = [
                    idx for idx, _ in enumerate(frames) if scores[idx] > threshold
                ]
                rank = len(valid_frames)
                if rank > 4:
                    logger(f"label '{label}': rank {rank}")
                # gather all ranks in `ranking_data`
                if label not in ranking_data:
                    ranking_data[label] = {}
                ranking_data[label][element.id] = rank
            logger(f"Rankings indexed for {element.id}.")
        except Exception as e:
            logger(f"Could not analyse {element.id}: {e}")

    # per label, order element ids by descending rank
    ranking = {}
    for label, values in ranking_data.items():
        s_vals = sorted(values.items(), key=operator.itemgetter(1))
        s_vals.reverse()
        ranking[label] = [t[0] for t in s_vals]

    path = WK_DIR / "all"
    if not os.path.exists(path):
        os.makedirs(path)
    out_file = path / "rankings.json"
    logger("All rankings aggregated, printed to all/rankings.json")
    with open(out_file, "w") as f:
        json.dump(ranking, f)
    return Etype.Json(element_id, out_file)
def test_Image(base):
    """Etype.Image casting rules: rejects non-images, missing files, and
    multiple images; accepts a single valid image (bare path or 1-list) and
    filters out non-image files.

    Fix: two assertions checked `im1` instead of the variable just created
    (`im2` and `im1_filtered`), so those cases were never actually verified.
    """
    # shouldn't accept one txt
    with pytest.raises(EtypeCastError):
        Etype.Image(base.id, ["/tmp/notafile.txt"])
    # shouldn't accept an image that doesn't exist
    with pytest.raises(EtypeCastError):
        Etype.Image(base.id, ["/tmp/nonexistent_image.png"])
    # shouldn't be okay with 2 valid images
    with pytest.raises(EtypeCastError):
        Etype.Image(base.id, [base.im1, base.im2])
    # works with either single path or list
    im1 = Etype.Image(base.id, base.im1)
    assert len(im1.paths) == 1
    im1 = Etype.Image(base.id, [base.im1])
    assert len(im1.paths) == 1
    im2 = Etype.Image(base.id, base.im2)
    assert len(im2.paths) == 1
    # filters out invalid files
    im1_filtered = Etype.Image(base.id, [base.im1, base.txt1])
    assert len(im1_filtered.paths) == 1
    assert im1_filtered.paths[0] == base.im1
def post_analyse(self, _):
    """Aggregate all indexed rankings into one JSON element under `all/`."""
    ranking = self.data_to_ranking()

    out_dir = WK_DIR / "all"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    out_path = out_dir / "rankings.json"
    self.logger("All rankings aggregated, printed to all/rankings.json")
    with open(out_path, "w") as fp:
        json.dump(ranking, fp)
    return Etype.Json("__RANKING", out_path)
def read_elements(self, qs: List[str]) -> List[LocalElement]:
    """Take a list of queries, and returns a flattened list of LocalElements
    for the specified folders. The order of the query is maintained in the
    return value."""
    els = []
    for query in qs:
        query_root = self.read_query(query)
        # TODO: cast elements properly and throw error if they don't conform
        for el_dir in subdirs(query_root):
            element = Etype.cast(el_dir.name, files(el_dir))
            element.query = query
            els.append(element)
    return els
def retrieve_element(self, element, _) -> Union[Etype.Video, Etype.Json]:
    """Download a video and its metadata for `element` via youtube-dl.

    Fix: the return annotation used `Union(...)` — a *call*, which raises
    `TypeError` the moment the `def` statement is evaluated. `Union` must be
    subscripted: `Union[...]`.

    Raises:
        ElementShouldSkipError: when the download fails (e.g. the video
            has been deleted upstream).
    """
    with self.ydl:
        try:
            result = self.ydl.extract_info(element.url)
            meta = TMP / element.id / "meta.json"
            with open(meta, "w+") as fp:
                json.dump(result, fp)
            self.logger(f"{element.id}: video and meta downloaded successfully.")
            self.disk.delete_local_on_write = True
            return Etype.cast(element.id, files(TMP / element.id))
        except youtube_dl.utils.DownloadError:
            raise ElementShouldSkipError(
                f"Something went wrong downloading {element.id}. It may have been deleted."
            )
def analyse_element(self, element: Etype.Audio, config) -> Etype.Audio:
    """Transcode an audio element to `config["output_ext"]` using ffmpeg.

    Fixes:
    - the devnull handle was opened and never closed (resource leak); it is
      now managed by a `with` block.
    - the return code of `call` was assigned and silently discarded despite
      the error-handling TODO; a non-zero exit is now at least logged.
    """
    output_ext = config["output_ext"]
    output = f"/tmp/{element.id}.{output_ext}"

    with open(os.devnull, "w") as devnull:
        returncode = call(
            ["ffmpeg", "-y", "-i", element.paths[0], output],
            stdout=devnull,
            stderr=STDOUT,
        )
    if returncode != 0:
        # TODO: proper error handling (raise?) — for now just surface it
        self.logger(f"ffmpeg exited with code {returncode} for '{element.id}'")

    self.logger(
        f"Converted '{element.id}' from {element.paths[0].suffix} to .{output_ext}"
    )
    return Etype.Audio(element.id, paths=[output])
def post_analyse(self, elements):
    """Build a conversation graph from each element's tweet thread, then
    export the whole graph as a single xlsx element."""
    for el in elements:
        with open(el.paths[0]) as fp:
            tweets = json.load(fp)

        # first tweet is the thread root; the rest are replies to it
        root = tweets[0]
        self.logger(f"Adding tweet {root['id']} to graph...")
        self.add_to_graph(root)
        for reply in tweets[1:]:
            self.logger(f"Adding reply {reply['id']} to graph...")
            self.add_to_graph(reply, inreplyto=root)

    xlsx_path = TMP / "final.xlsx"
    self.graph.to_xlsx(xlsx_path)
    return Etype.Any("FINAL", xlsx_path)
def retrieve_element(self, element, _):
    """Write the element's comment to disk and fetch its attachment, if any."""
    base = TMP / element.id
    base.mkdir(parents=True, exist_ok=True)

    with open(base / f"{element.id}_comment.txt", "w+") as fp:
        fp.write(element.comment)

    # only fetch when a URL is actually present
    if element.url != "":
        urlretrieve(element.url, base / element.filename)

    return Etype.cast(element.id, files(base))
def post_analyse(self, _):
    """Read back every analysed element, add its tweet thread to the graph,
    then export the graph as a single xlsx element."""
    # TODO: a kind of hack... should maybe make available as a func, i.e. `self.get_analysed()`
    analysed_els = self.disk.read_elements([self.dest_q])

    for el in analysed_els:
        with open(el.paths[0]) as fp:
            tweets = json.load(fp)

        # first tweet is the thread root; the rest are replies to it
        root = tweets[0]
        self.logger(f"Adding tweet {root['id']} to graph...")
        self.add_to_graph(root)
        for reply in tweets[1:]:
            self.logger(f"Adding reply {reply['id']} to graph...")
            self.add_to_graph(reply, inreplyto=root)

    xlsx_path = TMP / "final.xlsx"
    self.graph.to_xlsx(xlsx_path)
    return Etype.Any("FINAL", xlsx_path)
def from_preds(element, get_preds):
    """Collect per-frame predictions for an element's images into a single
    `preds.json`, merged with the element's existing metadata JSON.

    Fix: `p.suffix in ".json"` was a substring test (it also matched e.g.
    ".js" and "."); replaced with an equality check so only real .json
    files are treated as metadata.
    """
    imgs = [p for p in element.paths if p.suffix in IMG_SFXS]
    labels = {}
    for imp in imgs:
        frame_no, preds = deduce_frame_no(imp), get_preds(imp)
        for pred_label, pred_conf in preds:
            if pred_label in labels:
                labels[pred_label]["frames"].append(frame_no)
                labels[pred_label]["scores"].append(pred_conf)
            else:
                labels[pred_label] = {"frames": [frame_no], "scores": [pred_conf]}

    meta = [p for p in element.paths if p.suffix == ".json"][0]
    out = {**prepare_json(meta), "labels": labels}

    base = TMP / element.id
    base.mkdir(parents=True, exist_ok=True)
    outp = base / "preds.json"
    with open(outp, "w") as fp:
        json.dump(out, fp)
    return Etype.Json(element.id, outp)
def test_Any(base):
    """Etype.Any accepts arbitrary files and keeps every path it is given."""
    single = Etype.Any(base.id, [base.txt1])
    assert len(single.paths) == 1

    multi = Etype.Any(base.id, [base.txt1, base.md1, base.im3])
    assert len(multi.paths) == 3
def get_in_etype(self):
    """Accept either an array of images or a single JSON as input."""
    accepted = Etype.Union(Etype.Image.array(), Etype.Json)
    return accepted
def get_in_etype(self):
    """Accept either a JSON or a video as input."""
    accepted = Etype.Union(Etype.Json, Etype.Video)
    return accepted
def retrieve_element(self, row, config):
    """Cast a row straight to an element from its id and path."""
    element = Etype.cast(row.id, row.path)
    return element
def get_out_etype(self):
    """Produce either a video or a JSON as output."""
    produced = Etype.Union(Etype.Video, Etype.Json)
    return produced