def extract_article(url, html):
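    """Extract the title, body text, and publication date of an article page.

    `url` is the page URL and `html` its raw markup. Relies on module-level
    helpers (html_body_pat, escape_pat, blank_pat, cleaner, remove_blanks,
    get_content_index, strip_site_names_from_title, article_date_pat,
    select_date_from_candidates) and an `Article` class extended beyond
    stock newspaper with `titles`, `top_nodes`, and `format_top_node`,
    plus difflib, fuzzywuzzy, fuzzysearch, lxml, and articleDateExtractor.

    Returns (title, meta_title, h1_title, article_content, res_date,
    extracted_date, article.publish_date).
    """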
    article = Article(url, language='zh')

    # html_body_pat is assumed to strip stray closing </body>/</html> tags;
    # re-append a clean pair so the document ends well-formed.
    html = html_body_pat.sub(' ', html) + '</body></html>'
    article.download(input_html=html)

    article.parse()
    top_node = article.top_node
    title = article.title
    meta_title = None
    h1_title = None
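    # article.titles (a local Article extension) is assumed to hold the
    # meta/<title> candidate first and the <h1> candidate second.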
    if len(article.titles) > 1:
        meta_title = article.titles[0]
        h1_title = article.titles[1]

        if top_node is not None:
            # Select best title according to content
            content = ''.join(top_node.itertext())
            score1 = SequenceMatcher(None, article.titles[0], content).ratio()
            score2 = SequenceMatcher(None, article.titles[1], content).ratio()

            if score1 >= score2:
                # The meta title (titles[0]) overlaps the body more.
                title = article.title
            else:
                # The <h1> title overlaps the body more.
                title = article.titles[1]

    title = strip_site_names_from_title(title).strip()
    if meta_title:
        meta_title = strip_site_names_from_title(meta_title).strip()
    if h1_title:
        h1_title = strip_site_names_from_title(h1_title).strip()

    if title and len(title) > 6 and len(article.top_nodes) > 1:
        # Re-select the content by fuzzy-matching the title against each
        # candidate node's text; the 6-char floor keeps short, generic
        # titles from triggering this.
        content_map = {}
        for node in article.top_nodes:
            content = escape_pat.sub(' ', ''.join(node.itertext()))
            content = blank_pat.sub(
                '', content[get_content_index(title, content):])
            if content not in content_map:
                content_map[content] = node

        # fuzzywuzzy returns (text, score) pairs sorted best-first.
        bests = process.extractBests(title, list(content_map.keys()))
        if content_map[bests[0][0]] != article.top_node:
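            # Find the score and matched-text length of the node newspaper
            # originally chose, for comparison with the challenger.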
            top_score = 0
            top_len = 0
            for item in bests:
                if content_map[item[0]] == article.top_node:
                    top_score = item[1]
                    top_len = len(item[0])

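            # Switch only when the challenger scores more than twice as high
            # and its matched text is long enough to be a plausible body.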
            if bests[0][1] > top_score * 2 and (len(bests[0][0]) >= 50 or
                                                len(bests[0][0]) >= top_len / 2):
                top_node = content_map[bests[0][0]]

    if top_node is not None and len(article.top_nodes) > 1:
        # Promote to the parent when a sibling's text reads similarly,
        # since the article body is probably split across sibling nodes.
        top_text = blank_pat.sub('', ''.join(top_node.itertext()))
        for node in article.top_nodes:
            node_text = None
            if node == top_node.getparent():
                node_text = blank_pat.sub('', ''.join(node.itertext()))
                node_text = node_text.replace(top_text, '')
            elif node.getparent() == top_node.getparent():
                node_text = blank_pat.sub('', ''.join(node.itertext()))

            if node_text:
                s = SequenceMatcher(None, top_text, node_text)
                if s.ratio() >= 0.15:  # even modest overlap implies a split
                    top_node = top_node.getparent()
                    break

    if top_node is None:
        # newspaper found no content node. Locate one manually: descend to
        # the smallest element whose text still contains the title.
        tree = cleaner.clean_html(lxml.html.parse(StringIO(article.html)))
        elem = tree.getroot()
        while True:
            children = list(elem)
            node = None
            for child in reversed(children):
                try:
                    if title in ''.join(child.itertext()):
                        node = child
                        break
                except Exception:
                    # Comments and processing instructions can choke
                    # itertext(); skip them.
                    pass

            if node is not None:
                elem = node
            else:
                break

        # elem is the smallest element containing the title
        length = len(''.join(elem.itertext()).strip())
        while True:
            node = elem.getparent()
            if node is not None:
                parent_length = len(''.join(node.itertext()).strip())
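                # A parent with at least 3x the text likely spans the whole
                # article body rather than just the title block.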
                if parent_length >= length * 3:
                    top_node = node
                    break
            else:
                top_node = elem
                break

            elem = node

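    # format_top_node (a local Article extension) is assumed to render the
    # chosen node as plain text.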
    content = escape_pat.sub(' ', article.format_top_node(top_node, title))

    # Split the rendered text into a title part and a body part.
    index = get_content_index(title, content)
    article_title = title
    article_text = content[index:].strip()
    article_content = article_text

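    # Nothing follows the title: fall back to the text before it, or to the
    # title itself, and treat the page as having no separate title.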
    if not article_text:
        at = content[:index].strip()
        if at:
            article_text = at
        else:
            article_text = title
        article_title = ''

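    # Render the cleaned page to plain text so the title and body can be
    # located by fuzzy search; dates are then hunted around those positions.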
    tree = cleaner.clean_html(lxml.html.parse(StringIO(article.html)))
    html_text = ' '.join(tree.getroot().itertext())
    html_text = remove_blanks(html_text)
    filtered_article_title = remove_blanks(article_title)
    filtered_article_text = remove_blanks(article_text)

    title_start_idx = -1
    title_end_idx = -1
    content_idx = -1
    res_date = None
    res_date_verified = False
    if filtered_article_text:
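        # Locate the body in the page text, then the title just before it,
        # tolerating up to two character edits (max_l_dist=2).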
        x = find_near_matches(filtered_article_text, html_text, max_l_dist=2)
        if x:
            content_idx = x[0].start
            if filtered_article_title:
                y = find_near_matches(filtered_article_title,
                                      html_text[:content_idx],
                                      max_l_dist=2)
                if y:
                    title_start_idx = y[-1].start
                    title_end_idx = y[-1].end
        else:
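            # Whole-body match failed; locate body and title line by line
            # from the end, shrinking the search window as lines are found.
            # Only lines of 10+ characters are used, since shorter ones
            # match too easily.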
            title_lines = article_title.split('\n')
            article_lines = article_text.split('\n')

            end = len(html_text)
            nxt = end
            for i in range(len(article_lines) - 1, -1, -1):
                ln = remove_blanks(article_lines[i])
                if len(ln) >= 10:
                    x = find_near_matches(ln, html_text[:end], max_l_dist=2)
                    if x:
                        end = x[-1].start
                        nxt = x[-1].end
                    else:
                        break

            content_idx = end

            for i in range(len(title_lines) - 1, -1, -1):
                ln = remove_blanks(title_lines[i])
                if len(ln) >= 10:
                    x = find_near_matches(ln, html_text[:end], max_l_dist=2)
                    if x:
                        end = x[-1].start
                        nxt = x[-1].end
                    else:
                        break

            title_start_idx = end
            title_end_idx = nxt

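        # First preference: a date that sits between the title and the body.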
        if title_start_idx >= 0:
            text_between = html_text[title_end_idx:content_idx]
            dates_between = list(article_date_pat.finditer(text_between))
            res_date, res_date_verified, distance = select_date_from_candidates(
                dates_between, -1, text_between)

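        # No verified date there: scan the text before the title and after
        # the body, then keep the better candidate.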
        if not (res_date and res_date_verified):
            if title_start_idx >= 0:
                search_end = title_start_idx
            else:
                search_end = content_idx

            text_before = html_text[:search_end]
            dates_before = list(article_date_pat.finditer(text_before))
            res_before, res_before_verified, dist_before = select_date_from_candidates(
                dates_before, -1, text_before)

            text_after = html_text[content_idx:]
            dates_after = list(article_date_pat.finditer(text_after))
            res_after, res_after_verified, dist_after = select_date_from_candidates(
                dates_after, 1, text_after)

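            # Verified candidates beat unverified ones; between equals,
            # proximity to the article text decides.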
            if res_before:
                if res_after:
                    if res_before_verified and not res_after_verified:
                        res_date = res_before
                        res_date_verified = True
                    elif res_after_verified and not res_before_verified:
                        res_date = res_after
                        res_date_verified = True
                    elif dist_before <= dist_after:
                        if res_before_verified or not res_date:
                            res_date = res_before
                            res_date_verified = res_before_verified
                    else:
                        if res_after_verified or not res_date:
                            res_date = res_after
                            res_date_verified = res_after_verified
                else:
                    if res_before_verified or not res_date:
                        res_date = res_before
                        res_date_verified = res_before_verified
            elif res_after:
                if res_after_verified or not res_date:
                    res_date = res_after
                    res_date_verified = res_after_verified

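    # articleDateExtractor tries URL patterns and page metadata for a date.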
    extracted_date = extractArticlePublishedDate(url, html=article.html)

    # A publish date from the page's meta tags overrides a conflicting
    # heuristic date.
    if article.publish_date:
        if not (res_date and res_date.date() == article.publish_date.date()):
            res_date = article.publish_date
    # Otherwise fall back to the extracted (URL/metadata) date, unless the
    # heuristic date was verified.
    elif extracted_date and not (res_date and res_date_verified):
        res_date = extracted_date

    return (title, meta_title, h1_title, article_content, res_date,
            extracted_date, article.publish_date)