def prepare_documents():
    """Load every document file under ``<cwd>/documents/`` into Document objects.

    Each file is read as UTF-8. Its first three lines are passed to
    ``Document`` as the URL, name and pages values respectively; the
    remaining lines form the body, which is tokenized line by line with
    ``ck.tokenize``.

    Files are processed in sorted name order so that the sequential ids
    (starting at 0) are reproducible; ``os.listdir`` alone returns entries
    in arbitrary order. ``docs_url.txt`` and anything that is not a regular
    file are skipped.

    Returns:
        list: one ``Document`` per processed file, in processing order.

    Raises:
        OSError: if the documents directory or a file cannot be read.
        IndexError: if a file contains fewer than three lines.
    """
    path = os.path.join(os.getcwd(), "documents")
    docs = []
    doc_id = 0

    # TODO: docs_url.txt (apparently a document -> URL index) is currently
    # ignored rather than parsed.
    for file in sorted(os.listdir(path)):
        if file == "docs_url.txt":
            continue

        file_path = os.path.join(path, file)
        # A subdirectory would make open() fail; only regular files are documents.
        if not os.path.isfile(file_path):
            continue

        print(file)
        with open(file_path, encoding="utf-8") as f:
            lines = [x.strip('\n') for x in f.readlines()]

        # Header: the first three lines carry the metadata; the rest is the body.
        doc_url = lines[0]
        doc_name = lines[1]
        doc_pages = lines[2]
        lines = lines[3:]

        tokens = []
        for line in lines:
            tokens.extend(ck.tokenize(line))

        new_doc = Document(doc_id, doc_name, lines, tokens, doc_url, doc_pages)
        doc_id += 1
        print(new_doc)
        docs.append(new_doc)

    return docs
def __str__(self):
    """Return the Document string form followed by the " | Audio " marker."""
    base_text = Document.__str__(self)
    return base_text + " | Audio "
def __str__(self):
    """Return the Document string form followed by the " | Vidéo " marker."""
    base_text = Document.__str__(self)
    return base_text + " | Vidéo "
def __str__(self):
    """Return exactly the string form produced by Document.__str__."""
    base_text = Document.__str__(self)
    return base_text
def __str__(self):
    """Return the Document string form followed by the " | Multimédia" marker."""
    base_text = Document.__str__(self)
    return base_text + " | Multimédia"