def recommend_by_url(url):
    """Recommend up to 15 similar articles for the page at *url*.

    Fetches the page, extracts its readable text with readability's
    ``Document``, converts the text to bigrams and queries the LSI
    similarity index.  Articles hosted on the same site as *url* are
    skipped, duplicate URLs are dropped, and the bulky 'content'/'html'
    fields are stripped from each result.

    :param url: URL of the page to find recommendations for.
    :returns: list of article dicts, each annotated with a float 'score'.
    """
    parsed = urlparse(url)
    doc = Document(requests.get(url).content)
    content = html.fromstring(doc.content()).xpath('string()')
    bigrams = make_bigrams(content)
    vec_bow = dictionary.doc2bow(bigrams)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    # Highest similarity first.
    docs = sorted(enumerate(sims), key=lambda item: -item[1])
    results = []
    seen = set()  # set gives O(1) membership; the original list scan was O(n)
    for doc_id, score in docs:
        res = ARTICLES[doc_id]
        if 'url' not in res or res['url'] in seen:
            continue
        seen.add(res['url'])
        p = urlparse(res['url'])
        # Don't recommend pages from the site the reader is already on.
        if p.hostname.endswith(parsed.hostname):
            continue
        # Copy before mutating: the original deleted keys from the shared
        # ARTICLES entry, corrupting the cache for every later call.
        res = dict(res)
        res['score'] = float(score)
        res.pop('content', None)
        res.pop('html', None)
        # .get: 'summary' may be absent; the original raised KeyError then.
        if res.get('summary'):
            res['summary'] = res['summary'].strip()
        results.append(res)
        if len(results) > 14:  # cap at 15 recommendations
            break
    return results
def process_html(html):
    """Run readability over a raw HTML string and return its parts.

    :param html: raw HTML document as a string.
    :returns: dict with keys 'content', 'clean_html', 'short_title',
        'summary' (plain text via html_to_text) and 'title'.
    """
    doc = Document(html)
    parts = {}
    parts['content'] = doc.content()
    parts['clean_html'] = doc.get_clean_html()
    parts['short_title'] = doc.short_title()
    parts['summary'] = html_to_text(doc.summary())
    parts['title'] = doc.title()
    return parts
def get_article(d):
    # Fetch and extract readable text for the article described by dict *d*.
    # Skips the fetch entirely when the URL is already stored in `table`.
    # Mutates *d* in place, adding 'html', 'summary', 'content' and 'title'.
    # NOTE: Python 2 syntax (print statement, `except Exception, e`).
    url = d['url']
    # Already persisted previously -- nothing to do.
    if table.find_one(url=url):
        return
    print "fetching stuff for %s" % url
    d['html'] = requests.get(url).content
    try:
        doc = Document(d['html'])
        # xpath('string()') flattens the parsed tree to plain text.
        d['summary'] = html.fromstring(doc.summary()).xpath('string()')
        d['content'] = html.fromstring(doc.content()).xpath('string()')
        d['title'] = doc.title()
    except Exception, e:
        # Best-effort: readability can fail on malformed pages; log and
        # continue (d keeps the raw 'html' but no extracted fields).
        print e
def preprocess_doc(html_text):
    """ Preprocessing of an html text as a String is done here. Tags that are
    advertisement and that do not describe the content are removed at first.
    The encoding is detected and next the html is parsed and preprocessed
    using the readability-lxml Document class to clean the content (text and
    images embedded in the text). An HTML string is returned together with
    the title of the website.

    :author: Sebastian
    :param html_text: html document in string format to preprocess.
    :returns: The preprocessed html as a String and the title if needed by
        the callee.
    """
    # remove some common advertisement tags beforehand
    bs = BeautifulSoup(html_text, "lxml")
    for tag_desc in negative_tags:
        for tag in bs.findAll(
                attrs={'class': re.compile(r".*\b{}\b.*".format(tag_desc))}):
            tag.extract()
    doc = Document(str(bs.html), negative_keywords=negative_classes,
                   positive_keywords=positive_classes)
    try:
        # Detect the encoding of the html, if not detectable use utf-8
        # as default.
        encoding = chardet.detect(doc.content().encode()).get('encoding')
        title = doc.title()
    # BUG FIX: the original `except TypeError or IndexError` evaluates
    # `TypeError or IndexError` to just TypeError, so IndexError was never
    # caught.  A tuple catches both, as intended.
    except (TypeError, IndexError) as e:
        logger("Encountered {} setting encoding to utf-8.".format(str(e)))
        encoding = "utf-8"
        title = bs.title.getText()
    if not encoding:
        logger("Using default encoding utf-8")
        encoding = 'utf-8'
        title = bs.title.getText()
    doc.encoding = encoding
    head = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ' \
           '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n' + \
           '<head>\n' + \
           '<meta http-equiv="Content-Type" content="text/html" ' \
           'charset="' + encoding + '">\n' + '</head>\n' + '<body>\n' \
           + '<h1>' + title.split(sep='|')[0] + '</h1>'
    # Unparsable Type Error in encoding, where's the problem.
    text = head + doc.summary()[12:]
    # sometimes some tags get messed up and need to be translated back.
    # BUG FIX: the replacements were the no-ops replace("<", "<") /
    # replace(">", ">"); per the comment above the intent is to undo
    # HTML-escaping of angle brackets.
    text = text.replace("&lt;", "<").replace("&gt;", ">")
    logger(
        'Preprocessing done. Type of text is: {}, Length of test is {}'.format(
            type(text), len(text)))
    return text, title
def make_readable(url):
    """Fetch *url* and return readability's extraction as a dict.

    :param url: address of the page to fetch.
    :returns: dict with 'title', 'summary', 'content' and 'short_title',
        or None when the URL cannot be retrieved.
    """
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError:
        # Unreachable page -- signal failure to the caller.
        return None
    doc = Document(html)
    return {
        'title': doc.title(),
        'summary': doc.summary(),
        'content': doc.content(),
        'short_title': doc.short_title(),
    }
def extract(self, content):
    """Extract (title, date, article) from a raw HTML document.

    Uses readability's ``Document`` for the title and article body, then
    guesses the publication date by scanning the body text in a window of
    TITLE_THRESHOLD characters around the title occurrence.

    :param content: raw HTML string.
    :returns: tuple (title, date-or-None, article plain text), or None
        when no content, title or article can be extracted.
    """
    doc = Document(content)
    # Just to make sure it's wellformed.
    try:
        body = doc.content()
    except Exception:
        return None  # It does not have content. Just ignore.

    # Extract title.
    try:
        title = doc.short_title()
    except Exception:
        title = ""
    # BUG FIX: the original tested `title is ""` -- identity, not equality,
    # which only works by accident of CPython string interning.
    if title == "":
        return None

    # Extract article.
    try:
        article = doc.summary()
    except Exception:
        article = ""
    if article == "":  # BUG FIX: was `article is ""` (identity test)
        return None
    # Strip HTML tags from the article text.
    article = self.p.sub(' ', article)

    # Date strategy: find a date in the body text near the title.  (An
    # earlier revision also tried URL and <meta> tags; that code was dead
    # commented-out text and has been removed -- see VCS history.)
    body = self.pb1.sub('', body)
    body = self.pb2.sub('', body)
    body = self.pb3.sub('', body)
    body = self.pb4.sub(' ', body)
    body = " ".join(body.split())

    title_fixed_whitespaces = " ".join(title.split())
    title_fixed_whitespaces = title_fixed_whitespaces[:50]
    mid_point = body.find(title_fixed_whitespaces)
    # Little trick for WordPress titles: retry with a curly apostrophe.
    if mid_point == -1:
        title_fixed_whitespaces = title_fixed_whitespaces.replace("'", "’")
        mid_point = body.find(title_fixed_whitespaces)

    # Window of TITLE_THRESHOLD chars on each side of the title, clamped
    # to the body bounds.
    start_point = mid_point - TITLE_THRESHOLD
    if start_point < 0:
        start_point = 0
    end_point = mid_point + len(title_fixed_whitespaces) + TITLE_THRESHOLD
    if end_point > len(body):
        end_point = len(body)

    # Pick the LONGEST date-like match in the window.
    # BUG FIX: the original never updated max_length inside the loop, so it
    # returned the last positive-length candidate instead of the longest.
    max_length = 0
    date = None
    for cand_date, cand_length in find_dates(body[start_point:end_point]):
        if cand_length > max_length:
            max_length = cand_length
            date = cand_date

    return (title, date, article)