def modified_fulltext(parser, language, url):
    """
    Extract the main article text and HTML from an already parsed document.

    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser.

    Args:
        parser: the already parsed document; it is passed through rm_ads(),
            clean(), calculate_best_node() and post_cleanup() in turn.
        language: value stored in ``Configuration.language``.
        url: address of the page; only its hostname is used (given to
            rm_ads()).

    Returns:
        A dict of the form
        ``{'value': {'text': ..., 'html': ...}, 'pattern': 'modified'}``.
        Both strings are empty when no best node is found.
    """
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor

    hostname = urlparse(url).hostname

    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)

    # The module-level helpers clean(), calculate_best_node(), post_cleanup()
    # and get_formatted() are used in place of the corresponding
    # DocumentCleaner / ContentExtractor / OutputFormatter methods; the
    # upstream add_siblings() step is not applied here.
    doc = rm_ads(parser, hostname)
    doc = clean(document_cleaner, doc)
    doc = calculate_best_node(extractor, doc)

    if doc is not None:
        doc = post_cleanup(doc)
        text, html = get_formatted(doc)
    else:
        text = ''
        html = ''

    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'modified',
    }
Ejemplo n.º 2
0
    def get_article_content(self, html):
        """
        Return the content of the publication as HTML markup.

        Headings (h1-h6) and paragraphs found under <body> are collected in
        document order, trailing headings are dropped, and every kept element
        is cleaned with HTMLCleaner and serialized; the pieces are joined
        into one string.
        """
        heading = re.compile(r'h(1|2|3|4|5|6)')

        body = lxml.html.document_fromstring(html).find('body')
        article = []
        for node in body.iter('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            if node.text and node.text.strip():
                # A heading is kept only as the very first item or directly
                # after a paragraph; paragraphs with text are always kept.
                if (not heading.match(node.tag)
                        or not article
                        or article[-1].tag == 'p'):
                    article.append(node)
                continue

            if node.tag != 'p':
                continue

            # Paragraph without leading text: fall back to its first <span>
            # when that span holds more than one word.
            span = node.find('span')
            if span is None or not span.text:
                continue
            if len(span.text.strip().split()) > 1:
                # NOTE(review): the span is renamed while body.iter() is
                # still running, so the iterator may visit it again as a
                # 'p' -- confirm this cannot add it twice.
                span.tag = 'p'
                article.append(span)

        # Remove headings at the end of the list
        while article and heading.match(article[-1].tag):
            article.pop()

        cleaner = HTMLCleaner()
        return ''.join(
            lxml.html.tostring(cleaner.clean_html(node), encoding='unicode')
            for node in article
        )
Ejemplo n.º 3
0
def clean():
    """Move generated image and HTML files into the ``uploaded`` folder.

    Every entry of the current working directory whose name ends in
    ``.png``, ``.html``, ``.gif`` or ``.jpg`` is moved to ``./uploaded/``.
    The folder is created the first time a matching file is found, so
    nothing is created when there is nothing to move.
    """
    # Function-call form prints the same text on Python 2 and Python 3.
    print("cleaning")
    for image in os.listdir("."):
        if not image.endswith((".png", ".html", ".gif", ".jpg")):
            continue
        if not os.path.isdir("uploaded"):
            os.mkdir("uploaded")
        shutil.move(image, "./uploaded/" + image)


# Read the cleaned HTML for this post; the context manager closes the file
# as soon as its content has been read.
with open(name + "-clean.html") as f:
    test = f.read()

post = WordPressPost()
post.title = name
post.content = test
# post_status is not set here, so the post is not explicitly published.

try:
    wp.call(NewPost(post))
except Exception as exc:
    # Best effort: a failed upload must not stop the clean-up below, but the
    # failure is reported instead of being silently swallowed.
    print("upload failed: %s" % exc)

# Move the generated files away whether or not the upload succeeded.
clean()