def get_heading_feature(content_id=0, content_data='', html_data=''):
    """Extract text features from the heading (h*) tags of a document.

    status: optional

    @param content_id: if truthy, fetch the raw html for this content id
        and parse it (takes precedence over the other two arguments)
    @param content_data: an already-parsed BeautifulSoup object
    @param html_data: a raw html string to parse
    @return: dict of heading text features produced by text_analysis()
    """
    soup_data = None
    # Resolve the input source, in priority order: id -> soup -> raw html.
    if content_id:
        html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(html)
        except Exception:
            # Best-effort: fall through to the empty-soup fallback below.
            logger.exception('Failed to parse html for content_id %s', content_id)
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except Exception:
            logger.exception('Failed to parse supplied html_data')
    if not soup_data:
        # Empty soup keeps the feature extraction below from crashing and
        # yields zero-valued features instead.
        soup_data = BeautifulSoup('')
    # Extract h-tag words and turn them into text features.
    heading_data = get_heading_word(soup_data)
    heading_dict = text_analysis(heading_data, var_name='heading')
    return heading_dict
def get_text(content_id):
    """Extract the main article text for a stored content item.

    Fetches the raw html by id and runs it through boilerpipe's
    ArticleExtractor.

    @param content_id: id of the content row to fetch raw html for
    @return: extracted article text, or '' if extraction fails
    """
    raw_html = Content.get_raw_html_by_id(content_id)
    try:
        text = Extractor(extractor='ArticleExtractor', html=raw_html).getText()
    except Exception as e:
        # logger.exception records the traceback; return empty text so
        # callers can proceed with a degraded (text-less) feature set.
        logger.exception(
            '\nError extracting text from html. Exception: %s, %s',
            e.__class__.__name__, e)
        return ''
    return text
def get_anchor_feature(content_id=0, content_data='', html_data=''):
    """Extract features from anchor (a) tags in p tags.

    @status: optional
    @param content_id: if truthy, fetch the raw html for this content id
        and parse it (takes precedence over the other two arguments)
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @return: dict combining anchor-text features (text_analysis) and
        anchor-href features (url_analysis summed over all links)
    @todo:
    """
    soup_data = None
    anchor_text = []
    anchor_link = []
    # Resolve the input source, in priority order: id -> soup -> raw html.
    if content_id:
        raw_html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(raw_html)
        except Exception:
            # Best-effort: fall through to the empty-soup fallback below.
            logger.exception('Failed to parse html for content_id %s', content_id)
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except Exception:
            logger.exception('Failed to parse supplied html_data')
    if not soup_data:
        soup_data = BeautifulSoup('')
    # Collect anchor text tokens and hrefs.
    anchor_data = get_anchor(soup_data)
    for a in anchor_data:
        if a.string:
            anchor_text += a.string.split()
        if a.has_attr('href'):
            anchor_link.append(a['href'])
    # Run thru url analysis even with an empty soup so the feature keys
    # are always emitted (fixed bug).
    if not anchor_link:
        anchor_link = ['']
    # Extract text features.
    anchor_text = ' '.join(anchor_text)
    text_result = text_analysis(anchor_text, var_name='anchor')
    # Extract link features, one dict per link, then sum them.
    url_list = []
    for link in anchor_link:
        url_list.append(url_analysis(link, var_name='anchor'))
    url_result = sum_list_dict(url_list, 'anchor')
    # Merge via update(): dict(a.items() + b.items()) is Python-2-only
    # (dict_items does not support + on Python 3).
    anchor_dict = dict(text_result)
    anchor_dict.update(url_result)
    return anchor_dict
def get_html_feature(content_id=0, content_data='', html_data=''):
    """Extract structural html-tag-count features from a document.

    status: required

    @param content_id: if truthy, fetch the raw html for this content id
        and parse it (takes precedence over the other two arguments)
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @return: dict of tag-count features, or {} if no soup could be built
    @TODO: image, video, etc..
    """
    html_dict = {}
    soup = None
    # Resolve the input source, in priority order: id -> soup -> raw html.
    # NOTE: use a local for the fetched html instead of clobbering the
    # html_data parameter.
    if content_id:
        raw_html = Content.get_raw_html_by_id(content_id)
        try:
            soup = BeautifulSoup(raw_html)
        except Exception:
            logger.exception('Failed to parse html for content_id %s', content_id)
    elif content_data:
        soup = content_data
    elif html_data:
        try:
            soup = BeautifulSoup(html_data)
        except Exception:
            logger.exception('Failed to parse supplied html_data')
    if not soup:
        # Unlike the optional extractors, this one signals failure with an
        # empty dict rather than zero-valued features.
        return {}
    # Extract per-tag-group counts.
    html_dict['html_num'] = get_html_num(soup)
    html_dict['html_h'] = get_html_tags(soup, HTML_HEAD)
    html_dict['html_a'] = get_html_tags(soup, HTML_A)
    html_dict['html_p'] = get_html_tags(soup, HTML_P)
    html_dict['html_embed'] = get_html_tags(soup, HTML_EMBED)
    html_dict['html_style'] = get_html_tags(soup, HTML_STYLE)
    html_dict['html_layout'] = get_html_tags(soup, HTML_LAYOUT)
    html_dict['html_meta'] = get_html_tags(soup, HTML_META)
    html_dict['html_input'] = get_html_tags(soup, HTML_INPUT)
    html_dict['html_script'] = get_html_tags(soup, HTML_SCRIPT)
    return html_dict