def get_docs_from_xml(self, root):
    """Parse an RSS/XML feed tree into a list of new Document objects.

    Walks every ``<item>`` under each channel of *root*, builds a Document
    with title, dates, source URL, summary and a stable md5 guid, and
    returns only items whose guid has not been seen before (dedup via
    ``self.processed_guids``). Also bumps ``self.document_count`` per new doc.

    Args:
        root: parsed XML root element (e.g. from xml.etree.ElementTree);
            children are assumed to be channel elements containing <item>s.

    Returns:
        list[Document]: newly-seen documents, possibly empty.
    """
    def _text(item, tag):
        # Hardened accessor: missing tag or empty text yields "" instead of
        # raising AttributeError on None.find(...).text.
        el = item.find(tag)
        return (el.text if el is not None else None) or ""

    docs = []
    for channel in root:
        for item in channel.findall("item"):
            new_doc = Document()
            new_doc.title = _text(item, "title")
            new_doc.download_date = datetime.now(tz.tzutc())
            # BUGFIX: the original passed "" as a second positional argument
            # to dateparser.parse (not a valid format/parserinfo); the intent
            # was an `or ""` fallback on the possibly-None pubDate text.
            new_doc.publish_date = (
                dateparser.parse(_text(item, "pubDate")) or new_doc.download_date
            )
            if new_doc.publish_date.tzinfo is None or self.force_timezone:
                # Naive timestamps (or a forced override) get the feed's
                # configured timezone before normalization.
                new_doc.publish_date = new_doc.publish_date.replace(tzinfo=self.timezone)
            # Normalize everything to UTC for storage/comparison.
            new_doc.publish_date = new_doc.publish_date.astimezone(tz.tzutc())
            new_doc.source_url = _text(item, "link")
            new_doc.original_summary = strip_html(_text(item, "description"))
            # BUGFIX: `if item.find("guid"):` is falsy for an existing leaf
            # element (Element truthiness counts children), and the original
            # called .encode() on the Element instead of its .text.
            guid_text = _text(item, "guid")
            if guid_text:
                new_doc.guid = hashlib.md5(guid_text.encode('utf-8')).hexdigest()
            else:
                new_doc.guid = hashlib.md5(new_doc.source_url.encode('utf-8')).hexdigest()
            new_doc.provider = self.name
            if new_doc.guid not in self.processed_guids:
                self.processed_guids[new_doc.guid] = True
                self.document_count += 1
                docs.append(new_doc)
    return docs
def sentences(self):
    """Split this document into Sentence objects.

    The title contributes one sentence; the summary and the full content
    each contribute one sentence per non-blank '[.]'-delimited fragment.
    All text is run through utils.strip_html before wrapping.
    """
    def _fragments(text):
        # Yield the non-whitespace-only pieces of a '[.]'-delimited string.
        for piece in text.split('[.]'):
            if piece.strip():
                yield piece

    result = [Sentence(utils.strip_html(self.title))]
    for body in (self.original_summary, self.content):
        result.extend(Sentence(utils.strip_html(p)) for p in _fragments(body))
    return result