Example #1
def recommend_by_url(url):
    parsed = urlparse(url)
    # fetch the page and extract its readable text with readability-lxml
    doc = Document(requests.get(url).content)
    content = html.fromstring(doc.content()).xpath('string()')
    bigrams = make_bigrams(content)
    vec_bow = dictionary.doc2bow(bigrams)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    # rank all indexed articles by similarity, best first
    docs = sorted(enumerate(sims), key=lambda item: -item[1])
    results, seen = [], set()
    for idx, score in docs:
        res = ARTICLES[idx]
        if 'url' not in res or res['url'] in seen:
            continue
        seen.add(res['url'])
        p = urlparse(res['url'])
        # skip results from the same host as the query URL
        if p.hostname.endswith(parsed.hostname):
            continue
        res['score'] = float(score)
        if 'content' in res:
            del res['content']
        if 'html' in res:
            del res['html']
        if res.get('summary'):
            res['summary'] = res['summary'].strip()
        results.append(res)
        if len(results) > 14:  # return at most 15 recommendations
            break
    return results
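The function above relies on module-level globals (dictionary, lsi, index, ARTICLES, make_bigrams) that the excerpt does not show. A minimal sketch of how that gensim state might be built, assuming ARTICLES is a list of dicts with a 'content' field and make_bigrams tokenizes a string; num_topics is an illustrative value, not the original:

from urllib.parse import urlparse
import requests
from lxml import html
from readability import Document
from gensim import corpora, models, similarities

# Hypothetical setup for the globals used by recommend_by_url()
texts = [make_bigrams(a.get('content', '')) for a in ARTICLES]  # keep corpus order aligned with ARTICLES
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)  # num_topics is a guess
index = similarities.MatrixSimilarity(lsi[corpus])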
Example #2
def process_html(html):
    # pull every readability-lxml field out of a raw HTML string
    doc = Document(html)
    return {
        'content': doc.content(),
        'clean_html': doc.get_clean_html(),
        'short_title': doc.short_title(),
        'summary': html_to_text(doc.summary()),
        'title': doc.title()
    }
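html_to_text() is a project helper that the excerpt omits; a plausible stand-in using lxml, followed by a usage example:

import requests
from lxml import html as lxml_html
from readability import Document

def html_to_text(fragment):
    # stand-in for the project's helper: drop the tags, keep the text
    return lxml_html.fromstring(fragment).text_content()

fields = process_html(requests.get('https://example.com/article').text)
print(fields['short_title'])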
Example #3
def get_article(d):
    url = d['url']
    # skip URLs that are already stored in the table
    if table.find_one(url=url):
        return
    print("fetching stuff for %s" % url)
    d['html'] = requests.get(url).content
    try:
        doc = Document(d['html'])
        d['summary'] = html.fromstring(doc.summary()).xpath('string()')
        d['content'] = html.fromstring(doc.content()).xpath('string()')
        d['title'] = doc.title()
    except Exception as e:
        print(e)
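The table global is not shown; it behaves like a table from the dataset library. A sketch of the setup this function presumably runs against (the database URL and table name are made up):

import dataset
import requests
from lxml import html
from readability import Document

db = dataset.connect('sqlite:///articles.db')  # hypothetical database URL
table = db['articles']                         # hypothetical table name

d = {'url': 'https://example.com/some-story'}
get_article(d)   # fills in d['html'], d['summary'], d['content'], d['title']
table.insert(d)  # persisting the row is presumably left to the caller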
Example #4
def preprocess_doc(html_text):
    """
    Preprocessing of an html text as a String is done here. Tags that are advertisement and that do not describe the
    content are removed at first. The encoding is detected and next the html is parsed and preprocessed using the
    readability-lxml Document class to clean the content (text and images embedded in the text).
    An HTML string is returned together with the title of the website.

    :author: Sebastian
    :param html_text: html document in string format to preprocess.
    :returns: The preprocessed html as a String and the title if needed by the callee.
    """
    # remove some common advertisement tags beforehand
    bs = BeautifulSoup(html_text, "lxml")
    for tag_desc in negative_tags:
        for tag in bs.findAll(
                attrs={'class': re.compile(r".*\b{}\b.*".format(tag_desc))}):
            tag.extract()
    doc = Document(str(bs.html),
                   negative_keywords=negative_classes,
                   positive_keywords=positive_classes)
    try:
        # Detect the encoding of the html, if not detectable use utf-8 as default.
        encoding = chardet.detect(doc.content().encode()).get('encoding')
        title = doc.title()
    except (TypeError, IndexError) as e:
        logger("Encountered {}; setting encoding to utf-8.".format(str(e)))
        encoding = "utf-8"
        title = bs.title.getText()
    if not encoding:
        logger("Using default encoding utf-8")
        encoding = 'utf-8'
        title = bs.title.getText()
    doc.encoding = encoding

    head = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
            '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
            '<head>\n'
            '<meta http-equiv="Content-Type" content="text/html" '
            'charset="' + encoding + '">\n'
            '</head>\n<body>\n'
            '<h1>' + title.split(sep='|')[0] + '</h1>')

    # doc.summary() wraps its result in '<html><body>' (12 characters); strip that prefix
    text = head + doc.summary()[12:]

    # sometimes some tags get messed up and need to be translated back
    text = text.replace("&lt;", "<").replace("&gt;", ">")
    logger(
        'Preprocessing done. Type of text is: {}, length of text is {}'.format(
            type(text), len(text)))
    return text, title
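preprocess_doc() references several module-level names (negative_tags, negative_classes, positive_classes, logger) that the excerpt omits. Illustrative values, assuming logger is any callable that accepts a message string:

import re
import chardet
from bs4 import BeautifulSoup
from readability import Document

# Hypothetical keyword lists; the project's real lists are not shown.
negative_tags = ['ad', 'advert', 'banner', 'promo']
negative_classes = ['comment', 'sidebar', 'footer']
positive_classes = ['article', 'content', 'post']
logger = print  # stand-in for the project's logging callable

text, title = preprocess_doc(open('page.html', encoding='utf-8').read())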
Example #5
def make_readable(url):
    # urllib2 is Python 2 only; on Python 3 the same calls live in urllib.request/urllib.error
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError:
        return None

    document = Document(html)

    document_dict = {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title()
    }

    return document_dict
Example #6
    def extract(self, content):
        doc = Document(content)

        # Make sure the document is well-formed and actually has content
        try:
            body = doc.content()
        except Exception:
            # no usable content; ignore the document
            return None

        # Extract the title
        try:
            title = doc.short_title()
        except Exception:
            title = ""
        if not title:
            return None

        # Extract the article text
        try:
            article = doc.summary()
        except Exception:
            article = ""
        if not article:
            return None

        # Strip HTML tags from the article text
        article = self.p.sub(' ', article)

        # Extract the date
        '''
        Steps to get the correct date:
        1. Get it from the URL
        2. Get it from the metadata
        3. Get it from the first part of the body
        '''
        '''
        #2. Get from metadata
        metas = self.p2.findall(content)
        date2_candidates = []
        #Get content= from meta
        for meta in metas:
            p2s = re.compile(r'content="(.*?)"')
            met = p2s.search(meta)
            if met is None: continue
            if met.group(1) is None: continue
            try:
                date = dateutil.parser.parse(met.group(1))
                date2_candidates.append((date, len(met.group(1))))
            except ValueError:
                pass
            except TypeError:
                pass
        '''
        # 3. Get the date from the body text near the title
        body = self.pb1.sub('', body)
        body = self.pb2.sub('', body)
        body = self.pb3.sub('', body)
        body = self.pb4.sub(' ', body)
        body = " ".join(body.split())
        title_fixed_whitespaces = " ".join(title.split())
        title_fixed_whitespaces = title_fixed_whitespaces[:50]
        mid_point = body.find(title_fixed_whitespaces)
        # Little trick for WordPress titles: apostrophes are encoded as &#8217;
        if mid_point == -1:
            title_fixed_whitespaces = title_fixed_whitespaces.replace(
                "'", "&#8217;")
            mid_point = body.find(title_fixed_whitespaces)

        # search a window around the title's position, clamped to the body bounds
        start_point = max(mid_point - TITLE_THRESHOLD, 0)
        end_point = min(mid_point + len(title_fixed_whitespaces) + TITLE_THRESHOLD,
                        len(body))
        # Find the date: keep the candidate with the longest matched text
        max_length = 0
        date3 = None
        date3_candidates = find_dates(body[start_point:end_point])
        for dat3 in date3_candidates:
            if dat3[1] > max_length:
                max_length = dat3[1]
                date3 = dat3[0]

        # Select the best date (only the body-based candidate is active in this version)
        date = date3

        #Return a tuple
        return (title, date, article)
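extract() depends on instance regexes (self.p, self.pb1 through self.pb4), a TITLE_THRESHOLD constant, and a find_dates() helper returning (date, match_length) tuples, none of which are shown. A rough sketch of that scaffolding; the patterns are reconstructed from how the method applies them and are only guesses:

import re
from readability import Document

TITLE_THRESHOLD = 200  # guessed window size; the original value is not shown

class ArticleExtractor:
    def __init__(self):
        self.p = re.compile(r'<[^>]*>')                     # strips tags from the summary
        self.pb1 = re.compile(r'<script[\s\S]*?</script>')  # drops scripts from the body
        self.pb2 = re.compile(r'<style[\s\S]*?</style>')    # drops inline styles
        self.pb3 = re.compile(r'<!--[\s\S]*?-->')           # drops HTML comments
        self.pb4 = re.compile(r'<[^>]*>')                   # remaining tags become spaces

    # extract(), as defined above, would live here as a method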