Esempio n. 1
0
    def prepare_documents():
        """Build one Document per file found in ``<cwd>/documents``.

        Each file is read as UTF-8. Its first three lines are passed to
        ``Document`` as ``doc_url``, ``doc_name`` and ``doc_pages``; every
        remaining line is kept as body text and tokenized with
        ``ck.tokenize``. The entry named ``docs_url.txt`` is ignored.

        Entries are visited in sorted name order so that the ids handed out
        (0, 1, 2, ...) are reproducible between runs. Entries that are not
        regular files, and files with fewer than three lines, are skipped
        with a printed notice and do not consume an id.

        Returns:
            list: the created Document objects, in the order they were read.

        Raises:
            OSError: if the ``documents`` directory or one of its files
                cannot be read.
        """
        path = os.path.join(os.getcwd(), "documents")

        docs = []

        # TODO: docs_url.txt (one "<doc> <url>" pair per line) is not read
        # yet; the URL currently comes from the first line of each file.

        # os.listdir() returns names in arbitrary order; sorting keeps the
        # doc_id <-> file mapping stable.
        for file in sorted(os.listdir(path)):
            if file == "docs_url.txt":
                continue

            file_path = os.path.join(path, file)
            if not os.path.isfile(file_path):
                # open() would fail on a subdirectory.
                print("skipping " + file + ": not a regular file")
                continue
            print(file)

            with open(file_path, encoding="utf-8") as f:
                lines = [x.strip('\n') for x in f]

            if len(lines) < 3:
                # The three header lines (url, name, pages) are mandatory.
                print("skipping " + file + ": expected at least 3 lines, got "
                      + str(len(lines)))
                continue

            doc_url, doc_name, doc_pages = lines[:3]
            lines = lines[3:]

            tokens = []
            for line in lines:
                tokens.extend(ck.tokenize(line))

            # len(docs) is the next free id because only successfully built
            # documents are appended.
            new_doc = Document(len(docs), doc_name, lines, tokens, doc_url,
                               doc_pages)
            print(str(new_doc))
            docs.append(new_doc)

        return docs
Esempio n. 2
0
 def __str__(self):
     """Return ``Document.__str__`` output followed by the " | Audio " tag."""
     base_text = Document.__str__(self)
     return base_text + " | Audio "
Esempio n. 3
0
 def __str__(self):
     """Return ``Document.__str__`` output followed by the video tag."""
     base_text = Document.__str__(self)
     return base_text + " | Vidéo "
Esempio n. 4
0
 def __str__(self):
     """Return exactly what ``Document.__str__`` produces, with no suffix."""
     base_text = Document.__str__(self)
     return base_text
Esempio n. 5
0
 def __str__(self):
     """Return ``Document.__str__`` output followed by the multimedia tag."""
     base_text = Document.__str__(self)
     return base_text + " | Multimédia"