Example #1
def recommend_by_url(url):
    parsed = urlparse(url)
    # fetch the page and extract its readable text with readability-lxml
    doc = Document(requests.get(url).content)
    content = html.fromstring(doc.content()).xpath('string()')
    bigrams = make_bigrams(content)
    vec_bow = dictionary.doc2bow(bigrams)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    # rank all indexed articles by similarity, best first
    docs = sorted(enumerate(sims), key=lambda item: -item[1])
    results, seen = [], set()
    for idx, score in docs:
        res = ARTICLES[idx]
        if 'url' not in res or res['url'] in seen:
            continue
        seen.add(res['url'])
        p = urlparse(res['url'])
        # skip results from the same host as the query URL
        if p.hostname.endswith(parsed.hostname):
            continue
        res['score'] = float(score)
        if 'content' in res:
            del res['content']
        if 'html' in res:
            del res['html']
        if res.get('summary'):
            res['summary'] = res['summary'].strip()
        results.append(res)
        if len(results) > 14:  # return at most 15 recommendations
            break
    return results
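The function above relies on module-level globals (dictionary, lsi, index, ARTICLES, make_bigrams) that the excerpt does not show. A minimal sketch of how that gensim state might be built, assuming ARTICLES is a list of dicts with a 'content' field and make_bigrams tokenizes a string; num_topics is an illustrative value, not the original:

from urllib.parse import urlparse
import requests
from lxml import html
from readability import Document
from gensim import corpora, models, similarities

# Hypothetical setup for the globals used by recommend_by_url()
texts = [make_bigrams(a.get('content', '')) for a in ARTICLES]  # keep corpus order aligned with ARTICLES
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)  # num_topics is a guess
index = similarities.MatrixSimilarity(lsi[corpus])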
Example #2
def process_html(html):
    # pull every readability-lxml field out of a raw HTML string
    doc = Document(html)
    return {
        'content': doc.content(),
        'clean_html': doc.get_clean_html(),
        'short_title': doc.short_title(),
        'summary': html_to_text(doc.summary()),
        'title': doc.title()
    }
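html_to_text() is a project helper that the excerpt omits; a plausible stand-in using lxml, followed by a usage example:

import requests
from lxml import html as lxml_html
from readability import Document

def html_to_text(fragment):
    # stand-in for the project's helper: drop the tags, keep the text
    return lxml_html.fromstring(fragment).text_content()

fields = process_html(requests.get('https://example.com/article').text)
print(fields['short_title'])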
Example #3
def get_article(d):
    url = d['url']
    # skip URLs that are already stored in the table
    if table.find_one(url=url):
        return
    print("fetching stuff for %s" % url)
    d['html'] = requests.get(url).content
    try:
        doc = Document(d['html'])
        d['summary'] = html.fromstring(doc.summary()).xpath('string()')
        d['content'] = html.fromstring(doc.content()).xpath('string()')
        d['title'] = doc.title()
    except Exception as e:
        print(e)
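The table global is not shown; it behaves like a table from the dataset library. A sketch of the setup this function presumably runs against (the database URL and table name are made up):

import dataset
import requests
from lxml import html
from readability import Document

db = dataset.connect('sqlite:///articles.db')  # hypothetical database URL
table = db['articles']                         # hypothetical table name

d = {'url': 'https://example.com/some-story'}
get_article(d)   # fills in d['html'], d['summary'], d['content'], d['title']
table.insert(d)  # persisting the row is presumably left to the caller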
Example #4
def preprocess_doc(html_text):
    """
    Preprocessing of an html text as a String is done here. Tags that are advertisement and that do not describe the
    content are removed at first. The encoding is detected and next the html is parsed and preprocessed using the
    readability-lxml Document class to clean the content (text and images embedded in the text).
    An HTML string is returned together with the title of the website.

    :author: Sebastian
    :param html_text: html document in string format to preprocess.
    :returns: The preprocessed html as a String and the title if needed by the callee.
    """
    # remove some common advertisement tags beforehand
    bs = BeautifulSoup(html_text, "lxml")
    for tag_desc in negative_tags:
        for tag in bs.findAll(
                attrs={'class': re.compile(r".*\b{}\b.*".format(tag_desc))}):
            tag.extract()
    doc = Document(str(bs.html),
                   negative_keywords=negative_classes,
                   positive_keywords=positive_classes)
    try:
        # Detect the encoding of the html, if not detectable use utf-8 as default.
        encoding = chardet.detect(doc.content().encode()).get('encoding')
        title = doc.title()
    except (TypeError, IndexError) as e:
        logger("Encountered {}; setting encoding to utf-8.".format(str(e)))
        encoding = "utf-8"
        title = bs.title.getText()
    if not encoding:
        logger("Using default encoding utf-8")
        encoding = 'utf-8'
        title = bs.title.getText()
    doc.encoding = encoding

    head = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
            '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
            '<head>\n'
            '<meta http-equiv="Content-Type" content="text/html" '
            'charset="' + encoding + '">\n'
            '</head>\n<body>\n'
            '<h1>' + title.split(sep='|')[0] + '</h1>')

    # doc.summary() wraps its result in '<html><body>' (12 characters); strip that prefix
    text = head + doc.summary()[12:]

    # sometimes some tags get messed up and need to be translated back
    text = text.replace("&lt;", "<").replace("&gt;", ">")
    logger(
        'Preprocessing done. Type of text is: {}, length of text is {}'.format(
            type(text), len(text)))
    return text, title
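preprocess_doc() references several module-level names (negative_tags, negative_classes, positive_classes, logger) that the excerpt omits. Illustrative values, assuming logger is any callable that accepts a message string:

import re
import chardet
from bs4 import BeautifulSoup
from readability import Document

# Hypothetical keyword lists; the project's real lists are not shown.
negative_tags = ['ad', 'advert', 'banner', 'promo']
negative_classes = ['comment', 'sidebar', 'footer']
positive_classes = ['article', 'content', 'post']
logger = print  # stand-in for the project's logging callable

text, title = preprocess_doc(open('page.html', encoding='utf-8').read())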
Example #5
def make_readable(url):
    # urllib2 is Python 2 only; on Python 3 the same calls live in urllib.request/urllib.error
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError:
        return None

    document = Document(html)

    document_dict = {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title()
    }

    return document_dict
Example #6
    def extract(self, content):
        doc = Document(content)

        # Make sure the document is well-formed and actually has content
        try:
            body = doc.content()
        except Exception:
            # no usable content; ignore the document
            return None

        # Extract the title
        try:
            title = doc.short_title()
        except Exception:
            title = ""
        if not title:
            return None

        # Extract the article text
        try:
            article = doc.summary()
        except Exception:
            article = ""
        if not article:
            return None

        # Strip HTML tags from the article text
        article = self.p.sub(' ', article)

        # Extract the date
        '''
        Steps to get the correct date:
        1. Get it from the URL
        2. Get it from the metadata
        3. Get it from the first part of the body
        '''
        '''
        #2. Get from metadata
        metas = self.p2.findall(content)
        date2_candidates = []
        #Get content= from meta
        for meta in metas:
            p2s = re.compile(r'content="(.*?)"')
            met = p2s.search(meta)
            if met is None: continue
            if met.group(1) is None: continue
            try:
                date = dateutil.parser.parse(met.group(1))
                date2_candidates.append((date, len(met.group(1))))
            except ValueError:
                pass
            except TypeError:
                pass
        '''
        # 3. Get the date from the body text near the title
        body = self.pb1.sub('', body)
        body = self.pb2.sub('', body)
        body = self.pb3.sub('', body)
        body = self.pb4.sub(' ', body)
        body = " ".join(body.split())
        title_fixed_whitespaces = " ".join(title.split())
        title_fixed_whitespaces = title_fixed_whitespaces[:50]
        mid_point = body.find(title_fixed_whitespaces)
        # Little trick for WordPress titles: apostrophes are encoded as &#8217;
        if mid_point == -1:
            title_fixed_whitespaces = title_fixed_whitespaces.replace(
                "'", "&#8217;")
            mid_point = body.find(title_fixed_whitespaces)

        # search a window around the title's position, clamped to the body bounds
        start_point = max(mid_point - TITLE_THRESHOLD, 0)
        end_point = min(mid_point + len(title_fixed_whitespaces) + TITLE_THRESHOLD,
                        len(body))
        # Find the date: keep the candidate with the longest matched text
        max_length = 0
        date3 = None
        date3_candidates = find_dates(body[start_point:end_point])
        for dat3 in date3_candidates:
            if dat3[1] > max_length:
                max_length = dat3[1]
                date3 = dat3[0]

        # Select the best date (only the body-based candidate is active in this version)
        date = date3

        #Return a tuple
        return (title, date, article)
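extract() depends on instance regexes (self.p, self.pb1 through self.pb4), a TITLE_THRESHOLD constant, and a find_dates() helper returning (date, match_length) tuples, none of which are shown. A rough sketch of that scaffolding; the patterns are reconstructed from how the method applies them and are only guesses:

import re
from readability import Document

TITLE_THRESHOLD = 200  # guessed window size; the original value is not shown

class ArticleExtractor:
    def __init__(self):
        self.p = re.compile(r'<[^>]*>')                     # strips tags from the summary
        self.pb1 = re.compile(r'<script[\s\S]*?</script>')  # drops scripts from the body
        self.pb2 = re.compile(r'<style[\s\S]*?</style>')    # drops inline styles
        self.pb3 = re.compile(r'<!--[\s\S]*?-->')           # drops HTML comments
        self.pb4 = re.compile(r'<[^>]*>')                   # remaining tags become spaces

    # extract(), as defined above, would live here as a method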