def add_document(
    self,
    url,
    title,
    categories,
    published_time,
    content,
    author=None,
    topics=None,
    links=None,
    terms=None,
    document_id=None,
):
    """Build a ``document`` element and append it to ``self.corpus``.

    A URL already present in ``self.url_indices`` is logged and skipped;
    nothing is added in that case.

    Args:
        url: Address of the document. Must be neither ``None`` nor empty.
        title: Document title; passed through ``Corpus.unicodify`` before
            being stored.
        categories: Values stored as ``categories.category`` when truthy.
        published_time: Stored unchanged as ``published_time``.
        content: Iterable of paragraphs. Falsy paragraphs are dropped and the
            rest are passed through ``Corpus.unicodify`` and stored as
            ``content.p``.
        author: Stored unchanged as ``author``.
        topics: Values stored as ``topics.topic`` when truthy.
        links: Values stored as ``links.link`` when truthy.
        terms: Container indexed by term (iterating it yields the terms and
            ``terms[term]`` yields ``(begin, end)`` pairs).
        document_id: Identifier to use. When ``None`` or empty, the last six
            hex digits of the MD5 of the UTF-8 encoded title are used.

    Raises:
        KeyError: If ``url`` is ``None`` or has length zero.
    """

    def _location_node(span):
        # A span unpacks into exactly two values: begin and end.
        node = Element("location")
        node.begin, node.end = span
        return node

    if url is None or len(url) == 0:
        raise KeyError("'url' is mandatory")
    if url in self.url_indices:
        log.info(f"Ignoring duplicate URL={url}")
        return

    doc = Element("document")
    title = Corpus.unicodify(title)

    if document_id is not None and len(document_id) != 0:
        doc.document_id = document_id
    else:
        doc.document_id = md5(title.encode("utf-8")).hexdigest()[-6:]

    doc.url = url
    doc.title = title
    doc.author = author
    doc.published_time = published_time

    # Simple list sections: the container element is always created, the
    # items are attached (through the attribute just assigned on the
    # document) only when there is something to attach.
    simple_sections = (
        ("categories", "category", categories),
        ("topics", "topic", topics),
        ("links", "link", links),
    )
    for container_tag, item_tag, items in simple_sections:
        setattr(doc, container_tag, Element(container_tag))
        if items:
            setattr(getattr(doc, container_tag), item_tag, items)

    doc.content = Element("content")
    if content:
        paragraphs = []
        for paragraph in content:
            if paragraph:
                paragraphs.append(Corpus.unicodify(paragraph))
        doc.content.p = paragraphs

    # Terms: each term element is fully built before it is collected, and
    # the collected list is attached to the document once at the end.
    doc.terms = Element("terms")
    term_nodes = []
    if terms:
        for word in terms:
            term_node = Element("term")
            term_node.word = word
            term_node.locations = Element("locations")
            term_node.locations.location = [
                _location_node(span) for span in terms[word]
            ]
            term_nodes.append(term_node)
    # Attached even when no terms were supplied (the list is then empty).
    doc.terms.term = term_nodes

    self.corpus.append(doc)
    self.url_indices.append(url)