def modified_fulltext(parser, language, url):
    '''
    Extract the main article text and HTML from an already parsed document.

    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser instead of letting
    newspaper parse the page itself.

    parser   -- the already parsed document that is fed through the pipeline
    language -- value assigned to the newspaper configuration's ``language``
    url      -- page URL; only its hostname is used (passed to ``rm_ads``)

    Returns a dict of the form
    ``{'value': {'text': ..., 'html': ...}, 'pattern': 'modified'}``.
    Both ``text`` and ``html`` are empty strings when no best node is found.
    '''
    parsed_url = urlparse(url)

    # Imported lazily so newspaper is only loaded when this function runs.
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    settings = Configuration()
    settings.language = language
    settings.keep_article_html = True

    content_extractor = ContentExtractor(settings)
    cleaner = DocumentCleaner(settings)
    # NOTE(review): the formatter is constructed but never used below; the
    # module-level get_formatted() helper is called instead. Kept in case
    # the constructor has side effects -- confirm before removing.
    unused_formatter = OutputFormatter(settings)

    # The module-level helpers stand in for the corresponding newspaper
    # methods (DocumentCleaner.clean, ContentExtractor.calculate_best_node,
    # ContentExtractor.post_cleanup, OutputFormatter.get_formatted).
    without_ads = rm_ads(parser, parsed_url.hostname)
    cleaned = clean(cleaner, without_ads)
    best_node = calculate_best_node(content_extractor, cleaned)

    if best_node is None:
        text, html = '', ''
    else:
        # newspaper's add_siblings step is intentionally not applied here.
        text, html = get_formatted(post_cleanup(best_node))

    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'modified',
    }
def get_article_content(self, html):
    """
    Return the publication content as HTML markup.

    Walks the ``h1``-``h6`` and ``p`` elements of the document body in
    document order and keeps:

    * every paragraph that has non-blank leading text;
    * a heading with non-blank text, but only when nothing has been kept
      yet or the previously kept element is a paragraph (so two headings
      never follow each other);
    * for a paragraph without leading text, its first ``span`` child when
      that span contains more than one word (the span is re-tagged ``p``).

    Headings left at the very end of the selection are dropped. Each kept
    element is passed through ``HTMLCleaner.clean_html`` and serialised;
    the serialised fragments are concatenated and returned as one string.
    """
    heading_pattern = re.compile(r'h(1|2|3|4|5|6)')

    def is_heading(node):
        return heading_pattern.match(node.tag)

    document = lxml.html.document_fromstring(html)
    body = document.find('body')

    kept = []
    for node in body.iter('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
        leading_text = node.text
        if leading_text and leading_text.strip():
            # Skip a heading that would directly follow another heading.
            if is_heading(node) and kept and kept[-1].tag != 'p':
                continue
            kept.append(node)
            continue

        # No usable leading text: only paragraphs get a second chance,
        # via their first <span> child.
        if node.tag != 'p':
            continue
        inner = node.find('span')
        if inner is None or not inner.text:
            continue
        if len(inner.text.strip().split()) > 1:
            inner.tag = 'p'
            kept.append(inner)

    # Drop headings at the end of the list.
    while kept and is_heading(kept[-1]):
        kept.pop()

    cleaner = HTMLCleaner()
    fragments = [
        lxml.html.tostring(cleaner.clean_html(node), encoding='unicode')
        for node in kept
    ]
    return ''.join(fragments)
def clean():
    """Move generated files from the current directory into ``./uploaded``.

    Every entry of the current working directory whose name ends in
    ``.png``, ``.html``, ``.gif`` or ``.jpg`` is moved into the ``uploaded``
    sub-directory. The directory is created on demand, i.e. only when at
    least one matching file is found and it does not exist yet.

    Prints ``cleaning`` before doing any work. Returns ``None``.
    """
    print("cleaning")
    for image in os.listdir("."):
        # str.endswith accepts a tuple, so one call covers all extensions.
        if not image.endswith((".png", ".html", ".gif", ".jpg")):
            continue
        if not os.path.isdir("uploaded"):
            os.mkdir("uploaded")
        shutil.move(image, "./uploaded/" + image)


# Upload the cleaned HTML for `name` as a new WordPress post, then tidy up.
# `name`, `wp`, `WordPressPost` and `NewPost` are expected to be defined
# earlier in the file.
# The context manager guarantees the file handle is closed after reading.
with open(name + "-clean.html") as f:
    test = f.read()

post = WordPressPost()
post.title = name
post.content = test
# post_status is deliberately left unset (assigning 'publish' is disabled).
try:
    wp.call(NewPost(post))
except Exception as exc:
    # The upload stays best-effort: a failure is reported instead of being
    # swallowed silently, and the cleanup below still runs. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate.
    print("upload failed: %s" % exc)
# Cleanup runs exactly once, whether or not the upload succeeded.
clean()