Ejemplo n.º 1
0
 def add_document(
     self,
     url,
     title,
     categories,
     published_time,
     content,
     author=None,
     topics=None,
     links=None,
     terms=None,
     document_id=None,
 ):
     """Build a ``document`` element from the given fields and store it.

     The new element is appended to ``self.corpus`` and ``url`` is recorded
     in ``self.url_indices``. A URL already present in ``self.url_indices``
     is logged and skipped without touching the corpus.

     Args:
         url: Mandatory, non-empty identifier of the document.
         title: Title; passed through ``Corpus.unicodify`` before use.
         categories: Value stored as ``categories.category`` when truthy.
         published_time: Stored as-is on the document.
         content: Iterable of paragraphs; falsy items are dropped and the
             rest are passed through ``Corpus.unicodify``.
         author: Stored as-is on the document.
         topics: Value stored as ``topics.topic`` when truthy.
         links: Value stored as ``links.link`` when truthy.
         terms: Iterated for its words and indexed by word to obtain the
             locations; each location must unpack into ``(begin, end)``.
         document_id: Explicit id. When ``None`` or empty, the last six hex
             digits of the MD5 of the UTF-8 encoded title are used.

     Raises:
         KeyError: If ``url`` is ``None`` or has length zero.
     """
     # NOTE(review): child elements are always reached through their parent
     # after being assigned (``doc.terms.term = ...`` rather than keeping a
     # reference to the standalone element). Presumably ``Element`` comes
     # from lxml.objectify, which copies elements on assignment -- confirm
     # before reordering any of the statements below.

     def build_location(span):
         # One <location> carrying the begin/end pair of a term occurrence.
         node = Element("location")
         node.begin, node.end = span
         return node

     if url is None or len(url) == 0:
         raise KeyError("'url' is mandatory")
     if url in self.url_indices:
         log.info(f"Ignoring duplicate URL={url}")
         return

     doc = Element("document")
     title = Corpus.unicodify(title)

     # Scalar fields; assignment order is kept as it may define child order.
     if document_id is None or len(document_id) == 0:
         doc.document_id = md5(title.encode("utf-8")).hexdigest()[-6:]
     else:
         doc.document_id = document_id
     doc.url = url
     doc.title = title
     doc.author = author
     doc.published_time = published_time

     # List-valued fields: the container is always created, and its items
     # are only filled in when a truthy value was supplied.
     for container_tag, item_tag, values in (
         ("categories", "category", categories),
         ("topics", "topic", topics),
         ("links", "link", links),
     ):
         setattr(doc, container_tag, Element(container_tag))
         if values:
             setattr(getattr(doc, container_tag), item_tag, values)

     doc.content = Element("content")
     if content:
         paragraphs = [Corpus.unicodify(text) for text in content if text]
         doc.content.p = paragraphs

     # Terms: one <term> per word, each with its list of <location> spans.
     doc.terms = Element("terms")
     if terms:
         term_nodes = []
         for word in terms:
             term_node = Element("term")
             term_node.word = word
             term_node.locations = Element("locations")
             term_node.locations.location = [
                 build_location(span) for span in terms[word]
             ]
             term_nodes.append(term_node)
         doc.terms.term = term_nodes

     self.corpus.append(doc)
     self.url_indices.append(url)