Example #1
    def analyse_element(self, element: Union(Array(Etype.Image), Etype.Json),
                        _) -> Etype.Json:
        self.logger(f"Running inference on frames in {element.id}...")
        val = Etype.CvJson.from_preds(element, self.get_preds)
        self.logger(f"Wrote predictions JSON for {element.id}.")
        self.disk.delete_local_on_write = True
        return val
Example #2
def test_cast(base):
    # explicit cast
    with pytest.raises(EtypeCastError):
        cast(base.id, [], Etype.Image)
    with pytest.raises(EtypeCastError):
        cast(base.id, [base.txt1], Etype.Image)

    t1 = cast(base.id, [base.im1], to=Etype.Image)
    assert len(t1.paths) == 1
    assert t1.et == Etype.Image

    # implicit cast
    with pytest.raises(EtypeCastError):
        cast(base.id, [])

    i1 = cast(base.id, [base.im1])
    assert len(i1.paths) == 1
    assert i1.et == Etype.Image
    i2 = cast(base.id, [base.im2])
    assert len(i2.paths) == 1
    assert i2.et == Etype.Image

    ia1 = cast(base.id, [base.im1, base.im2])
    assert len(ia1.paths) == 2
    assert ia1.et == Array(Etype.Image)

    a1 = cast(base.id, base.aud1)
    assert len(a1.paths) == 1
    assert a1.et == Etype.Audio

    # unions

    ai1 = cast(base.id, [base.im3, base.aud1])
    assert len(ai1.paths) == 2
    assert ai1.et == Union(Etype.Image, Etype.Audio)

    ai2 = cast(base.id, [base.aud1, base.im2])
    assert len(ai2.paths) == 2
    assert ai2.et == Union(Etype.Image, Etype.Audio)

    iaa1 = cast(base.id, [base.im1, base.im2, base.aud1])
    assert len(iaa1.paths) == 3
    assert iaa1.et == Union(Array(Etype.Image), Etype.Audio)

    any1 = cast(base.id, [base.im1, base.im2, base.aud1, base.txt1])
    assert len(any1.paths) == 4
    assert any1.et == Etype.Any
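
The assertions encode cast's inference ladder: a single file of one kind yields a scalar Etype, several of the same kind an Array, a mix of kinds a Union, and anything broader falls back to Etype.Any. Below is a minimal sketch of that dispatch, using a hypothetical infer_etype helper keyed on filename suffixes; the real cast works on mtriage's Etype classes, and the suffix sets and returned strings here are purely illustrative:

from collections import Counter
from pathlib import Path

IMAGE_SUFFIXES = {".jpg", ".png"}  # illustrative
AUDIO_SUFFIXES = {".mp3", ".wav"}

def infer_etype(paths):
    counts = Counter()
    for p in paths:
        suffix = Path(p).suffix.lower()
        if suffix in IMAGE_SUFFIXES:
            counts["Image"] += 1
        elif suffix in AUDIO_SUFFIXES:
            counts["Audio"] += 1
        else:
            counts["Other"] += 1
    if not counts:
        raise ValueError("cannot cast an empty element")  # cf. EtypeCastError
    if "Other" in counts:
        return "Any"  # cf. Etype.Any
    # a kind with several members becomes an Array of that kind
    parts = [k if n == 1 else f"Array({k})" for k, n in sorted(counts.items())]
    return parts[0] if len(parts) == 1 else f"Union({', '.join(parts)})"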
Example #3
from importlib import import_module

import numpy as np
import tensorflow as tf
from keras.preprocessing.image import img_to_array, load_img

# KERAS_HOME, SUPPORTED_MODELS, rank, and the mtriage types (Analyser, Etype,
# Union, Array, InvalidAnalyserConfigError) are defined in the surrounding module.


class KerasPretrained(Analyser):
    in_etype = Union(Array(Etype.Image), Etype.Json)
    out_etype = Etype.Json

    def pre_analyse(self, config):
        self.logger(config["model"])
        self.logger(f"Storing models in {KERAS_HOME}")
        MOD = SUPPORTED_MODELS.get(config["model"])
        if MOD is None:
            raise InvalidAnalyserConfigError(
                f"The module '{config['model']}' either does not exist, or is not yet supported."
            )

        rLabels = config["labels"]  # whitelist of labels to keep in predictions

        # TODO: make it so that this doesn't redownload every run.
        # i.e. refactor it into partial.Dockerfile
        self.model_module = import_module(
            f"keras.applications.{MOD['module']}")
        impmodel = getattr(self.model_module, config["model"])
        # NB: this downloads the weights if they don't exist
        self.model = impmodel(weights="imagenet")
        self.THRESH = 0.1

        # revert to serial if CPU (TODO: debug why parallel CPU doesn't work)
        if not tf.test.is_gpu_available():
            self.in_parallel = False

        def get_preds(img_path):
            img = load_img(img_path, target_size=(224, 224))
            x = img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = self.model_module.preprocess_input(x)
            preds = self.model.predict(x)

            # decode_predictions returns only the top 5 labels by default;
            # ask for 10 so the whitelist filter below sees more candidates
            decoded = self.model_module.decode_predictions(preds, top=10)

            # keep whitelisted labels that clear the confidence threshold
            filteredPreds = [p for p in decoded[0] if p[1] in rLabels]
            return [(x[1], float(x[2])) for x in filteredPreds
                    if float(x[2]) >= self.THRESH]

        self.get_preds = get_preds

    def analyse_element(self, element, _):
        self.logger(f"Running inference on frames in {element.id}...")
        val = Etype.CvJson.from_preds(element, self.get_preds)
        self.logger(f"Wrote predictions JSON for {element.id}.")
        self.disk.delete_local_on_write = True
        return val

    def post_analyse(self, elements) -> Etype.Json:
        return rank(elements, logger=self.logger)
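
For context, pre_analyse expects a config along these lines; a minimal sketch in which the model name is assumed to be a key of SUPPORTED_MODELS and the whitelist values are illustrative ImageNet class names:

config = {
    "model": "ResNet50",         # assumed to be listed in SUPPORTED_MODELS
    "labels": ["tank", "jeep"],  # whitelist matched against decoded labels
}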
Example #4
    def retrieve_element(self, element, _) -> Union(Etype.Video, Etype.Json):
        with self.ydl:
            try:
                result = self.ydl.extract_info(element.url)
                meta = TMP / element.id / "meta.json"
                with open(meta, "w+") as fp:
                    json.dump(result, fp)
                self.logger(
                    f"{element.id}: video and meta downloaded successfully.")
                self.disk.delete_local_on_write = True
                return Etype.cast(element.id, files(TMP / element.id))
            except youtube_dl.utils.DownloadError:
                raise ElementShouldSkipError(
                    f"Something went wrong downloading {element.id}. It may have been deleted."
                )
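
Note that the returned element is cast from whatever files ended up under TMP / element.id; after a successful run that is the downloaded .mp4 plus meta.json, so Etype.cast infers Union(Etype.Video, Etype.Json), matching the annotation.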
Example #5
def test_Union(base):
    ImAud = Union(Etype.Image, Etype.Audio)
    with pytest.raises(EtypeCastError):
        ImAud(base.id, [])
    with pytest.raises(EtypeCastError):
        ImAud(base.id, base.txt1)
    with pytest.raises(EtypeCastError):
        ImAud(base.id, base.im1)
    with pytest.raises(EtypeCastError):
        ImAud(base.id, base.aud1)

    has2 = ImAud(base.id, [base.aud1, base.im1])
    assert len(has2.paths) == 2
    f2 = ImAud(base.id, [base.im3, base.md1, base.aud1])
    assert len(f2.paths) == 2
    assert base.im3 in f2.paths
    assert base.aud1 in f2.paths
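
Union therefore filters rather than fails when extra files are present: base.md1 is simply dropped from f2. An empty list, or a list missing one of the member types, raises EtypeCastError instead.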
Example #6
    def analyse_element(self, element: Union(Etype.Json, Etype.Video),
                        config) -> GLOSSED_FRAMES:
        fps = int(config["fps"]) if "fps" in config else 1
        jsons = [x for x in element.paths if x.suffix == ".json"]
        dest = Path("/tmp") / element.id
        if dest.exists():
            rmtree(dest)
        dest.mkdir()

        if len(jsons) == 1:
            meta_json = jsons[0]
            copyfile(meta_json, dest / "meta.json")

        video = [x for x in element.paths if x.suffix in VID_SUFFIXES][0]
        ffmpeg_frames(dest, video, fps)

        self.logger(f"Frames successfully created for element {element.id}.")
        self.disk.delete_local_on_write = True
        return GLOSSED_FRAMES(element.id, paths=files(dest))
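
ffmpeg_frames is defined elsewhere in the module; here is a minimal sketch of what it plausibly does, assuming the helper's signature and output naming pattern (only ffmpeg's -vf fps filter and %04d frame numbering are standard):

import subprocess
from pathlib import Path

def ffmpeg_frames(dest: Path, video: Path, fps: int) -> None:
    # dump frames at the requested rate as dest/frame_0001.jpg, frame_0002.jpg, ...
    subprocess.run(
        ["ffmpeg", "-i", str(video), "-vf", f"fps={fps}",
         str(dest / "frame_%04d.jpg")],
        check=True,
    )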
Example #7
import json
import re
from datetime import datetime, timedelta

import googleapiclient.discovery
import youtube_dl
from googleapiclient.errors import HttpError

# TMP, API_KEY, YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, files, and the
# mtriage types (Selector, Etype, Union, LocalElementsIndex,
# ElementShouldSkipError) are defined in the surrounding module.


class Youtube(Selector):
    out_etype = Union(Etype.Json, Etype.Video)

    def index(self, _) -> LocalElementsIndex:
        results = self._run()
        if len(results) > 0:
            out = []
            out.append(list(results[0].keys()))
            out.extend([list(x.values()) for x in results])
            return LocalElementsIndex(out)
        return None

    def pre_retrieve(self, _):
        self.ydl = youtube_dl.YoutubeDL({
            "outtmpl": f"{TMP}/%(id)s/%(id)s.mp4",
            "format": "worstvideo[ext=mp4]",
        })

    def retrieve_element(self, element, _):
        with self.ydl:
            try:
                result = self.ydl.extract_info(element.url)
                meta = TMP / element.id / "meta.json"
                with open(meta, "w+") as fp:
                    json.dump(result, fp)
                self.logger(
                    f"{element.id}: video and meta downloaded successfully.")
                self.disk.delete_local_on_write = True
                return Etype.cast(element.id, files(TMP / element.id))
            except youtube_dl.utils.DownloadError:
                raise ElementShouldSkipError(
                    f"Something went wrong downloading {element.id}. It may have been deleted."
                )

    def _run(self):
        self.logger(f"Query: {self.config['search_term']}")
        if "uploaded_after" in self.config:
            self.logger(f"Start: {self.config['uploaded_after']}")

        if "uploaded_before" in self.config:
            self.logger(f"End: {self.config['uploaded_before']}")

        if self.config.get("daily"):
            results = []
            self.logger(
                f"Scraping daily, from {self.config['uploaded_after']} -- {self.config['uploaded_before']}"
            )
            self.logger("-----------------")
            for after, before in self._days_between(
                    self.config["uploaded_after"],
                    self.config["uploaded_before"]):
                results = results + self.get_results(before, after)

        else:
            results = self.get_results(self.config.get("uploaded_before"),
                                       self.config.get("uploaded_after"))

        self.logger("\n\n----------------")
        self.logger(f"Scrape successful, {len(results) - 1} results.")

        return results

    def get_results(self, before, after):
        args_obj = {"q": self.config["search_term"]}

        # use the bounds passed in, so daily scraping respects each day's window
        if before is not None:
            args_obj["before"] = before
        if after is not None:
            args_obj["after"] = after

        new_results = self._youtube_search_all_pages(args_obj)
        if new_results is None:
            raise Exception("Something went wrong")
        return new_results

    def _add_to_csv_obj(self, csv_obj, s_res):
        for search_result in s_res:
            videoId = search_result["id"]["videoId"]
            title = search_result["snippet"]["title"]
            channelId = search_result["snippet"]["channelId"]
            desc = search_result["snippet"]["description"]
            publishedAt = search_result["snippet"]["publishedAt"]
            url = f"https://www.youtube.com/watch?v={videoId}"
            elem_id = self._id_from_url(url)  # avoids shadowing the id() builtin
            csv_obj.append({
                "url": url,
                "title": title.replace(",", ";"),
                "desc": desc.replace(",", ";"),
                "published": publishedAt[0:10],
                "id": elem_id,
            })
        return csv_obj

    def _youtube_search_all_pages(self, args):
        csv_obj = []
        self.logger(
            f"Search terms: {args['q']}\n Start: {args['after'] if 'after' in args else ''}\n End: {args['before'] if 'before' in args else ''}"
        )
        try:
            s_res = self._youtube_search(args)
            count = 0
            while ("nextPageToken"
                   in s_res) and (len(s_res.get("items", [])) != 0):
                self.logger(f"\tScraping page {count}...")
                count += 1
                csv_obj = self._add_to_csv_obj(csv_obj, s_res.get("items", []))
                s_res = self._youtube_search(args,
                                             pageToken=s_res["nextPageToken"])
            # the final page carries no nextPageToken, so add its items here
            csv_obj = self._add_to_csv_obj(csv_obj, s_res.get("items", []))
            self.logger("\tAll pages scraped.")
            return csv_obj
        except HttpError as e:
            self.logger(f"An HTTP error {e.resp.status} occured.")
            print(e.content)
            return None

    def _youtube_search(self, options, pageToken=None):
        # modified from https://github.com/youtube/api-samples/blob/master/python/search.py
        if API_KEY is None:
            raise ElementShouldSkipError("No GOOGLE_API_KEY specified in .env")
        youtube = googleapiclient.discovery.build(YOUTUBE_API_SERVICE_NAME,
                                                  YOUTUBE_API_VERSION,
                                                  developerKey=API_KEY)

        theargs = {
            "pageToken": pageToken,
            "q": options["q"],
            "part": "id,snippet",
            "maxResults": 50,
            "safeSearch": "none",
            "type": "video",
        }

        if "before" in options:
            theargs["publishedBefore"] = options["before"]
        if "after" in options:
            theargs["publishedAfter"] = options["after"]

        request = youtube.search().list(**theargs)

        return request.execute()

    def _days_between(self, start, end):
        bef = datetime.strptime(end[:-1], "%Y-%m-%dT%H:%M:%S")
        aft = datetime.strptime(start[:-1], "%Y-%m-%dT%H:%M:%S")
        between = (bef - aft).days
        return [(
            ((aft + timedelta(days=dt)).strftime("%Y-%m-%dT") + "00:00:00Z"),
            ((aft + timedelta(days=dt)).strftime("%Y-%m-%dT") + "23:59:59Z"),
        ) for dt in range(between)]

    def _id_from_url(self, url):
        id_search = re.search(r"https://www\.youtube\.com/watch\?v=(.*)",
                              url, re.IGNORECASE)
        if id_search:
            return id_search.group(1)
        return None
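
A worked example of the daily windowing (config values are illustrative): _days_between slices the range into per-day (after, before) pairs, and get_results is called once per pair:

config = {
    "search_term": "airstrike",
    "uploaded_after": "2018-01-01T00:00:00Z",
    "uploaded_before": "2018-01-03T00:00:00Z",
    "daily": True,
}

# _days_between(config["uploaded_after"], config["uploaded_before"]) returns:
# [("2018-01-01T00:00:00Z", "2018-01-01T23:59:59Z"),
#  ("2018-01-02T00:00:00Z", "2018-01-02T23:59:59Z")]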