    def _preprocessing_test(self, anns, new_anns, qid, vids):
        """ Preprocessing annotations
        Args:
            anns: annotations
            new_anns: dictionary to fill with preprocessed annotations
            qid: start query id
            vids: list of video ids collected so far
        Returns:
            new_anns: preprocessed annotations
            qid: next query id
            vids: deduplicated list of video ids
        """
        translator = str.maketrans("", "", string.punctuation)
        for vid in anns.keys():
            ann = anns[vid]
            duration = ann["duration"]
            for ts, q in zip(ann["timestamps"], ann["sentences"]):
                new_anns[str(qid)] = {
                    "timestamps": ts,
                    "query": q,
                    "tokens": utils.tokenize(q.lower(), translator),
                    "duration": duration,
                    "video_id": vid
                }
                qid += 1
        vids.extend(list(anns.keys()))
        return new_anns, qid, list(set(vids))
    def _preprocessing(self, anns, aux_ann_path):
        """ Preprocessing annotations
        Args:
            anns: annotations
            aux_ann_path: path for annotations for auxiliary information (e.g., duration)
        Returns:
            new_anns: preprocessed annotations
            vids: deduplicated list of video ids
        """
        aux_anns = io_utils.load_csv(aux_ann_path)
        vid2len = {ann["id"]: ann["length"] for ann in aux_anns}
        vids = []

        new_anns = dict()
        translator = str.maketrans("", "", string.punctuation)
        for qid, ann in enumerate(anns):
            info, query = ann.split("##")
            vid, spos, epos = info.split(" ")
            duration = vid2len[vid]
            new_anns[str(qid)] = {
                "timestamps": [float(spos), float(epos)],
                "query": query,
                "tokens": utils.tokenize(query.lower(), translator),
                "duration": float(duration),
                "video_id": vid
            }
            vids.append(vid)
        return new_anns, list(set(vids))
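For context, a minimal sketch of the raw inputs the two methods above appear to expect: `_preprocessing` parses plain-text lines of the form `vid spos epos##query` plus an auxiliary CSV with `id`/`length` columns, while `_preprocessing_test` consumes a per-video dict with parallel `timestamps` and `sentences` lists. The `dataset` object, the file path, and all values below are hypothetical, shown only to illustrate the expected shapes.

# Hypothetical inputs, invented for illustration.

# _preprocessing: one "vid spos epos##query" line per annotation,
# plus a CSV with (id, length) rows giving each video's duration.
raw_lines = [
    "v_001 2.4 7.9##Person opens the fridge.",
    "v_001 10.0 15.5##Person pours a glass of milk.",
]
new_anns, vids = dataset._preprocessing(raw_lines, "video_lengths.csv")
# new_anns["0"]["tokens"] -> roughly ["person", "opens", "the", "fridge"]
# (lowercased, punctuation stripped via the translator)

# _preprocessing_test: dense-captioning style dict keyed by video id.
raw_test_anns = {
    "v_002": {
        "duration": 120.0,
        "timestamps": [[0.0, 12.5], [30.0, 47.2]],
        "sentences": ["A person enters the room.", "They sit at a desk."],
    }
}
new_anns, next_qid, vids = dataset._preprocessing_test(raw_test_anns, {}, 0, [])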
Example #3
from typing import List

def split_in_paragraphs(document: WikipediaDocument,
                        max_n_tokens: int = 300) -> List[WikipediaDocument]:
    """Split a document into chunks of at most max_n_tokens tokens each."""
    split_documents = []
    tokens = utils.tokenize(document.content)
    current_split_tokens = []
    for tok in tokens:
        current_split_tokens.append(tok)
        # Flush once the current chunk reaches the token limit.
        if len(current_split_tokens) >= max_n_tokens:
            split_documents.append(
                WikipediaDocument(id=document.id,
                                  title=document.title,
                                  content=' '.join(current_split_tokens),
                                  label=document.label))
            current_split_tokens = []
    # Flush the remaining tokens as a final, possibly shorter, chunk.
    if current_split_tokens:
        split_documents.append(
            WikipediaDocument(id=document.id,
                              title=document.title,
                              content=' '.join(current_split_tokens),
                              label=document.label))
    return split_documents
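A short usage sketch: the WikipediaDocument built below uses only the fields the function actually reads (id, title, content, label); the concrete values, and the assumption that utils.tokenize splits this content on whitespace, are illustrative only.

# Hypothetical usage; values invented for illustration.
doc = WikipediaDocument(id="Q42",
                        title="Douglas Adams",
                        content=" ".join(["token"] * 750),
                        label="person")
chunks = split_in_paragraphs(doc, max_n_tokens=300)
# Assuming whitespace tokenization, ~750 tokens yield three chunks
# (300 + 300 + 150 tokens), each keeping the original id, title, and label.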