Example #1
0
def newspaper_fulltext(parser, language):
    '''
    Run newspaper3k's text-extraction steps on an existing document tree.

    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser

    parser   -- the existing tree; it is handed to the cleaner as the document
    language -- value stored on the newspaper Configuration as ``language``

    Returns {'value': {'text': ..., 'html': ...}, 'pattern': 'newspaper3k'}.
    Both 'text' and 'html' are None when no best node is found.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    settings = Configuration()
    settings.language = language
    node_extractor = ContentExtractor(settings)
    cleaner = DocumentCleaner(settings)
    formatter = OutputFormatter(settings)

    # Clean the tree first, then let the extractor pick the best node.
    best = node_extractor.calculate_best_node(cleaner.clean(parser))

    # Defaults cover the case where the extractor finds no candidate node.
    text = html = None
    if best is not None:
        text, html = formatter.get_formatted(node_extractor.post_cleanup(best))

    value = {'text': text, 'html': html}
    return {'value': value, 'pattern': 'newspaper3k'}
Example #2
0
def newspaper_fulltext2(parser, language, url):
    '''
    This is a faster version of the function that uses some internal newspaper3k functions
    so that the lxml parse tree doesn't need to be recreated.
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser

    parser   -- the existing tree; it is handed to the cleaner as the document
    language -- value stored on the newspaper Configuration as ``language``
    url      -- accepted for interface compatibility; not used by this function

    Returns {'value': {'text': ..., 'html': ...}, 'pattern': 'newspaper3k'}.
    Both 'text' and 'html' are empty strings when no best node is found.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    settings = Configuration()
    settings.language = language
    settings.keep_article_html = True
    node_extractor = ContentExtractor(settings)
    cleaner = DocumentCleaner(settings)
    formatter = OutputFormatter(settings)

    best = node_extractor.calculate_best_node(cleaner.clean(parser))

    if best is None:
        # The extractor found no candidate node: report empty strings.
        text, html = '', ''
    else:
        text, html = formatter.get_formatted(node_extractor.post_cleanup(best))

    value = {'text': text, 'html': html}
    return {'value': value, 'pattern': 'newspaper3k'}
Example #3
0
 def __init__(self):
     """Create the newspaper helper objects this instance works with."""
     # Meta config for article and parser.
     config = Configuration()
     self.config = config
     # Parser supplied by the configuration.
     self.parser = config.get_parser()
     # Extracts info (author, tags, text, etc.) from a parsed article.
     self.extractor = ContentExtractor(config)
     # Cleans unwanted tags and nodes from the DOM.
     self.doc_cleaner = DocumentCleaner(config)
     # Outputs formatted text from parsed xpath nodes.
     self.formatter = OutputFormatter(config)
Example #4
0
def html_to_article(content, language):
    """Reduce an HTML document to the HTML of its main article node.

    The input is stripped and parsed, block-level elements get trailing
    ``<br>`` pairs, and the cleaner and extractor then select the best node.
    That node is converted back to HTML, stripped and unescaped.

    Returns an empty string when the input is blank, when the parser yields
    no document, or when no best node is found.
    """
    content = content.strip()
    if len(content) == 0:
        return ''

    config = NewspaperConfig()
    config.language = language

    doc = config.get_parser().fromstring(content)
    if doc is None:
        return ''

    # Split block-level elements with newlines (two <br> per element).
    splittable = (t for t in _BLOCKLEVEL_TAGS if t not in _MEANINGLESS_TAGS)
    for tag in splittable:
        for node in doc.xpath('//{}'.format(tag)):
            for _ in range(2):
                node.append(etree.Element('br'))

    # Initial cleanup
    cleaner = _NewspaperCleaner(config)
    doc = cleaner.clean(doc)

    # Best node estimation
    extractor = NewspaperExtractor(config)
    top = extractor.calculate_best_node(doc)

    if top is not None:
        top = extractor.post_cleanup(top)

        # Cleanup dummy nodes used for estimation
        for dummy in top.xpath("//p[@newspaper='dummy']"):
            dummy.getparent().remove(dummy)

        # Custom formatting to avoid unnecessary computations
        formatter = NewspaperFormatter(config)
        formatter.top_node = top
        formatter.remove_negativescores_nodes()
        article = unescape(str(formatter.convert_to_html()).strip())
        del top, formatter
    else:
        article = ''

    # Drop the tree references before clearing lxml's error log.
    del doc, cleaner, extractor
    etree.clear_error_log()

    return article
Example #5
0
def fulltext(html, language='en'):
    """Takes article HTML string input and outputs the fulltext.

    Per the original note, the input string is decoded via UnicodeDammit if
    needed (that happens inside the configured parser, not in this function).

    Args:
        html: article HTML to parse.
        language: value stored on the Configuration as ``language``.

    Returns:
        A ``(text, article_html)`` tuple as produced by the output
        formatter, or ``('', '')`` when the extractor finds no best node.
    """
    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = WithTagOutputFormatter(config)

    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)

    top_node = extractor.calculate_best_node(doc)
    if top_node is None:
        # No candidate node was scored; post_cleanup() cannot work on None,
        # so report an empty article instead of crashing.
        return '', ''

    top_node = extractor.post_cleanup(top_node)
    text, article_html = output_formatter.get_formatted(top_node)
    return text, article_html
Example #6
0
def ProcessArticle(urlStr, domain, htmlStr, cursor):
    """Extract an article from raw HTML and store it, or report a failure.

    Args:
        urlStr: URL of the article; passed to the error handler or the store.
        domain: passed through to ``StoreToDatabase``.
        htmlStr: raw HTML of the page.
        cursor: passed through to ``StoreToDatabase``.

    When no text can be extracted, ``OnArticleProcessError(urlStr)`` is
    called and nothing is stored. Otherwise title, authors, keywords and a
    summary are computed and handed to ``StoreToDatabase``.
    """
    text = fulltext(htmlStr)
    if not text:
        # Bail out before the parsing / keyword / summary work: none of it
        # is used when the article has no text.
        OnArticleProcessError(urlStr)
        return

    config = Configuration()
    extractor = ContentExtractor(config)
    parsed_doc = config.get_parser().fromstring(htmlStr)
    title = extractor.get_title(parsed_doc)
    authors = extractor.get_authors(parsed_doc)

    text_keyws = list(nlp.keywords(text).keys())
    title_keyws = list(nlp.keywords(title).keys())

    # Merge both keyword lists without duplicates.
    keyws = list(set(title_keyws + text_keyws))
    summary_sents = nlp.summarize(title=title,
                                  text=text,
                                  max_sents=config.MAX_SUMMARY_SENT)
    summary = '\n'.join(summary_sents)

    StoreToDatabase(urlStr, domain, title, authors, text, keyws, summary,
                    cursor)
def modified_fulltext(parser, language, url):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser

    The module-level helpers rm_ads / clean / calculate_best_node /
    post_cleanup / get_formatted are used where the upstream code calls the
    cleaner, extractor and formatter methods of the same names.

    parser   -- the existing tree; it is handed to rm_ads as the document
    language -- value stored on the newspaper Configuration as ``language``
    url      -- page URL; its hostname is passed to rm_ads

    Returns {'value': {'text': ..., 'html': ...}, 'pattern': 'modified'}.
    Both 'text' and 'html' are empty strings when no best node is found.
    '''
    target = urlparse(url)

    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    settings = Configuration()
    settings.language = language
    settings.keep_article_html = True
    node_extractor = ContentExtractor(settings)
    cleaner = DocumentCleaner(settings)
    # Constructed as in the upstream code; the helpers below do not take it.
    output_formatter = OutputFormatter(settings)

    tree = rm_ads(parser, target.hostname)
    tree = clean(cleaner, tree)
    best = calculate_best_node(node_extractor, tree)

    if best is None:
        text, html = '', ''
    else:
        text, html = get_formatted(post_cleanup(best))

    value = {'text': text, 'html': html}
    return {'value': value, 'pattern': 'modified'}
def get_data_from_html(html):
    """Extract title, publication date and main text from an HTML page.

    Args:
        html: HTML source handed to ``Parser.fromstring``.

    Returns:
        A dict with the keys:
          'title'        -- result of the extractor's ``get_title``
          'published_at' -- ISO-format string of the extracted publishing
                            date, or of the current local time when the
                            extractor finds none
          'content'      -- formatted article text, or '' when the
                            extractor finds no best node
    """
    result = {}
    parsed_html = Parser.fromstring(html)

    config = Configuration()
    extractor = ContentExtractor(config)
    formatter = OutputFormatter(config)
    cleaner = DocumentCleaner(config)

    result['title'] = extractor.get_title(parsed_html)

    # No URL is available here, so an empty string is passed for it.
    publishing_date = extractor.get_publishing_date('', parsed_html)
    if publishing_date is None:
        publishing_date = datetime.datetime.now()

    result['published_at'] = publishing_date.isoformat()

    cleaned_html = cleaner.clean(parsed_html)
    top_node = extractor.calculate_best_node(cleaned_html)
    if top_node is None:
        # Nothing scored as article content; post_cleanup() cannot work on
        # None, so report empty content instead of crashing.
        result['content'] = ''
        return result

    top_node = extractor.post_cleanup(top_node)
    result['content'], _ = formatter.get_formatted(top_node)

    return result
Example #9
0
class ArticleExtractionPipeline(object):
    """Item pipeline that turns raw page HTML into article metadata and text.

    ``process_item`` fills in title, description, keywords, date and cleaned
    text on the item, and raises ``DropItem`` for items that have no usable
    date, are older than a week, or carry too little text.
    """

    # Words that tend to precede the timestamp inside a <time> element.
    _TIME_INDICATORS = ("Edited", "Updated", "Published")

    def __init__(self):
        # Meta config for article and parser.
        self.config = Configuration()
        self.parser = self.config.get_parser()
        # Extracts info (author, tags, text, etc.) from a parsed article.
        self.extractor = ContentExtractor(self.config)
        # Cleans unwanted tags and nodes from the DOM.
        self.doc_cleaner = DocumentCleaner(self.config)
        # Outputs formatted text from parsed xpath nodes.
        self.formatter = OutputFormatter(self.config)

    @staticmethod
    def _strip_indicator(time_string):
        """Return the part of *time_string* that should hold the timestamp.

        If one of the indicator words is present, only the text after it is
        inspected: everything up to a standalone word "at" is dropped, or
        failing that everything up to the first ":". Looking only after the
        indicator, and only at whole words, keeps the "at" inside "Updated"
        from being mistaken for the separator. Strings without an indicator,
        or without a separator after it, are returned unchanged.
        """
        for indicator in ArticleExtractionPipeline._TIME_INDICATORS:
            _, found, rest = time_string.partition(indicator)
            if not found:
                continue
            words = rest.split()
            if "at" in words:
                return " ".join(words[words.index("at") + 1:])
            if ":" in rest:
                return rest.split(":", 1)[1]
            break
        return time_string

    # right now basically only works for RT
    def find_date_from_html(self, doc):
        """Return the earliest date found in ``<time>`` elements, or None.

        ``doc`` is parsed html from ``self.parser``.
        """
        # https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
        candidates = self.parser.getElementsByTag(doc, tag="time")  # add more
        times = []
        for candidate in candidates:
            time_string = candidate.text
            if not time_string:
                # <time> elements without direct text have nothing to parse.
                continue
            parsed = self.datetime_from_str(self._strip_indicator(time_string))
            if parsed:
                times.append(parsed)
        return min(times) if times else None

    def datetime_from_str(self, datetime_string):
        """Parse *datetime_string* into a naive datetime, or return None.

        A value that already is a ``datetime`` is accepted as-is (the
        extractor may hand back parsed dates rather than strings). In both
        cases ``tzinfo`` is removed without converting the clock time,
        because naive and offset-aware datetimes cannot be compared.
        """
        if isinstance(datetime_string, datetime.datetime):
            return datetime_string.replace(tzinfo=None)
        try:
            return date_parser.parse(datetime_string).replace(tzinfo=None)
        except (ValueError, OverflowError, AttributeError, TypeError):
            return None

    # TODO: generalize
    def get_date(self, url, doc):
        """Return the article date as a naive datetime, or None.

        ``doc`` is parsed html from ``self.parser``. Meta information is
        tried first; if it is missing or cannot be parsed, the ``<time>``
        elements of the page are scanned instead.
        """
        raw_date = (
            self.extractor.get_publishing_date(url, doc)
            or  # telesur, africanews
            self.extractor.get_meta_content(doc,
                                            "meta[name='LastModifiedDate']")
            or  # aljazeera, Sun, 07 January 2018 18:36:49 GMT
            self.extractor.get_meta_content(doc, "meta[name='Last-Modified']")
            or  # times of india, Jan 9, 2018, 05:18 IST
            self.extractor.get_meta_content(
                doc, "meta[property='og:updated_time']")
        )  # diplomat, "2018-01-05 23:22:46"
        if raw_date:
            parsed = self.datetime_from_str(raw_date)
            if parsed:
                return parsed
        return self.find_date_from_html(doc)

    def recent_article(self, date, max_days_elapsed=3):
        """Return True if *date* (a naive datetime) is less than
        *max_days_elapsed* days before the current local time."""
        return datetime.datetime.now() - date < datetime.timedelta(
            days=max_days_elapsed)

    def process_item(self, item, spider):
        """Populate *item* from ``item["content"]`` (raw HTML) and return it.

        Raises:
            DropItem: when no date is found, the date is more than 7 days
                old, no article node is found, or the extracted text is
                shorter than 600 characters.
        """
        doc = self.parser.fromstring(item["content"])

        item["title"] = self.extractor.get_title(doc)
        item["description"] = self.extractor.get_meta_description(doc)
        item["keywords"] = (self.extractor.get_meta_content(
            doc, "meta[name='news_keywords']")
                            or self.extractor.get_meta_keywords(doc))
        item["date"] = self.get_date(item["url"], doc)

        # drop item if no date
        if not item["date"] or not self.recent_article(
                item["date"], max_days_elapsed=7):
            raise DropItem("Missing or invalid date for: {}".format(
                item["title"]))

        # clean:
        clean_doc = self.doc_cleaner.clean(doc)
        best_node = self.extractor.calculate_best_node(clean_doc)
        if best_node is None:
            # No candidate node at all: treat it like a too-short article
            # rather than letting post_cleanup() fail on None.
            raise DropItem("Not enough text: {}".format(item["title"]))
        top_node = self.extractor.post_cleanup(best_node)
        # get_formatted returns (text, html); only the text is kept.
        item["content"] = self.formatter.get_formatted(top_node)[0]

        # drop item if article too short
        if len(item["content"]) < 600:
            raise DropItem("Not enough text: {}".format(item["title"]))

        logging.info("ARTICLE TITLE: %s", item["title"])
        logging.info("\t time: %s", item["date"])
        return item
Example #10
0
def modified_fulltext(parser, language):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser

    This variant replaces the extractor's node scoring with the local
    calculate_best_node below and does not run the DocumentCleaner pass.

    parser   -- the existing tree; it is scored directly as the document
    language -- value stored on the newspaper Configuration as ``language``

    Returns {'value': {'text': ..., 'html': ...}, 'pattern': 'modified'}.
    Both 'text' and 'html' are None when no best node is found.
    '''
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    def calculate_best_node(self, doc):
        '''
        Score the parents (and grandparents) of text-bearing <pre>, <p> and
        <td> nodes and return the highest-scoring one, or None when there
        are no candidates. ``self`` is the ContentExtractor whose helpers
        are used.
        '''
        cxpath_body_nodes = lxml.etree.XPath('(//pre)|(//p)|(//td)')

        def stopword_count(node):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            return word_stats.get_stopword_count()

        # Keep each candidate together with its stop-word count so the
        # count does not have to be recomputed in the scoring loop.
        nodes_with_text = []
        for node in cxpath_body_nodes(doc):
            count = stopword_count(node)
            high_link_density = self.is_highlink_density(node)
            if count > 2 and not high_link_density:
                nodes_with_text.append((node, count))

        nodes_number = len(nodes_with_text)
        # The last quarter of the candidates is subject to the position
        # penalty below.
        bottom_negativescore_nodes = float(nodes_number) * 0.25

        starting_boost = float(1.0)
        parent_nodes = []

        for i, (node, count) in enumerate(nodes_with_text):
            boost_score = float(0)
            # Each boostable node gets a smaller bonus than the previous
            # one: 50, 25, 16.6, ...
            if self.is_boostable(node):
                boost_score = float((1.0 / starting_boost) * 50)
                starting_boost += 1
            # With more than 15 candidates, nodes near the end of the
            # document receive a quadratic penalty; once that penalty
            # exceeds 40 it is replaced by a flat score of 5.
            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(
                        bottom_negativescore_nodes - (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    if abs(boost_score) > 40:
                        boost_score = float(5)

            upscore = int(count + boost_score)

            parent_node = self.parser.getParent(node)
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)

            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            # The grandparent receives half of the score.
            parent_parent_node = self.parser.getParent(parent_node)
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)

        top_node = None
        top_node_score = 0
        for e in parent_nodes:
            score = self.get_score(e)

            if score > top_node_score:
                top_node = e
                top_node_score = score

            # Fall back to the first parent when nothing has scored above 0.
            if top_node is None:
                top_node = e
        return top_node

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    output_formatter = OutputFormatter(config)

    top_node = calculate_best_node(extractor, parser)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None

    return {
        'value' : {
            'text' : text,
            'html' : html,
            },
        'pattern' : 'modified',
        }