Ejemplo n.º 1
0
def get_heading_feature(content_id=0, content_data='', html_data=''):
    """
    Extract text features from the heading (h*) tags of a page.

    status: optional

    @param content_id: db id; when truthy, raw html is fetched via Content
    @param content_data: pre-parsed BeautifulSoup object (used if no id)
    @param html_data: raw html string (used if no id and no soup)
    @return: dict of heading features produced by text_analysis
    """
    soup_data = ''

    # get data: priority is content_id, then pre-parsed soup, then raw html
    if content_id:
        html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(html)
        except Exception:
            # parsing is best-effort; a failure falls through to the
            # empty-soup default below (was a bare `except:` which also
            # swallowed SystemExit/KeyboardInterrupt)
            pass
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except Exception:
            pass
    if not soup_data:
        # keep the downstream pipeline uniform by using an empty soup
        soup_data = BeautifulSoup('')

    #extract h tags and features
    heading_data = get_heading_word(soup_data)
    heading_dict = text_analysis(heading_data, var_name='heading')

    return heading_dict
Ejemplo n.º 2
0
def get_heading_feature(content_id=0, content_data='', html_data=''):
    """
    Extract text features from the heading (h*) tags of a page.

    status: optional

    @param content_id: db id; when truthy, raw html is fetched via Content
    @param content_data: pre-parsed BeautifulSoup object (used if no id)
    @param html_data: raw html string (used if no id and no soup)
    @return: dict of heading features produced by text_analysis
    """
    soup_data = ''

    # get data: priority is content_id, then pre-parsed soup, then raw html
    if content_id:
        html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(html)
        except Exception:
            # best-effort parse; fall through to the empty-soup default
            # (narrowed from a bare `except:` that also caught SystemExit)
            pass
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except Exception:
            pass
    if not soup_data:
        # empty soup keeps the feature extraction below from crashing
        soup_data = BeautifulSoup('')

    #extract h tags and features
    heading_data = get_heading_word(soup_data)
    heading_dict = text_analysis(heading_data, var_name='heading')

    return heading_dict
Ejemplo n.º 3
0
def get_text(content_id):
    """Return the extracted article text for the content row *content_id*.

    Logs the full traceback and returns an empty string when the
    boilerpipe extraction raises.
    """
    raw_html = Content.get_raw_html_by_id(content_id)
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=raw_html)
        return extractor.getText()
    except Exception as e:
        logger.exception('\nError extracting text from html. Exception: %s, %s',
                         e.__class__.__name__, e)
        return ''
Ejemplo n.º 4
0
def get_text(content_id):
    """Fetch raw html by id and run article-text extraction on it.

    On any extraction failure the traceback is logged and '' is returned,
    so callers always receive a string.
    """
    raw_html = Content.get_raw_html_by_id(content_id)
    try:
        text = Extractor(extractor='ArticleExtractor',
                         html=raw_html).getText()
    except Exception as e:
        logger.exception(
            '\nError extracting text from html. Exception: %s, %s',
            e.__class__.__name__, e)
        text = ''
    return text
Ejemplo n.º 5
0
def get_anchor_feature(content_id=0, content_data='', html_data=''):
    """
    Extract text and url features from anchor (a) tags.

    a tags in p tags
    @status: optional
    @param content_id: db id; when truthy, raw html is fetched via Content
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @return: dict merging text_analysis and url_analysis features
    @todo:
    """
    soup_data = ''
    anchor_text = []
    anchor_link = []

    # get data: priority is content_id, then pre-parsed soup, then raw html
    if content_id:
        raw_html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(raw_html)
        except Exception:
            # best-effort parse; fall through to the empty-soup default
            # (narrowed from a bare `except:`)
            pass
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except Exception:
            pass
    if not soup_data:
        soup_data = BeautifulSoup('')

    # get a tags links and text; hrefs are only collected for anchors
    # that have visible text (a.string)
    anchor_data = get_anchor(soup_data)
    for a in anchor_data:
        if a.string:
            anchor_text += a.string.split()
            if a.has_attr('href'):
                anchor_link.append(a['href'])

    #run thru text analysis even with empty soup cause of the loops(fixed bug)
    if not anchor_link:
        anchor_link = ['']

    #extract text features
    anchor_text = ' '.join(anchor_text)
    text_result = text_analysis(anchor_text, var_name='anchor')

    #extract link features
    url_list = []
    for link in anchor_link:
        url_list.append(url_analysis(link, var_name='anchor'))
    url_result = sum_list_dict(url_list, 'anchor')

    # merge the two feature dicts; dict(a.items() + b.items()) is
    # Python-2-only (dict_items does not support `+` on Python 3)
    anchor_dict = dict(text_result)
    anchor_dict.update(url_result)

    return anchor_dict
Ejemplo n.º 6
0
def get_anchor_feature(content_id=0, content_data='', html_data=''):
    """
    Extract text and url features from anchor (a) tags.

    a tags in p tags
    @status: optional
    @param content_id: db id; when truthy, raw html is fetched via Content
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @return: dict merging text_analysis and url_analysis features
    @todo:
    """
    soup_data = ''
    anchor_text = []
    anchor_link = []

    # get data: priority is content_id, then pre-parsed soup, then raw html
    if content_id:
        raw_html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(raw_html)
        except Exception:
            # best-effort parse; fall through to the empty-soup default
            # (narrowed from a bare `except:`)
            pass
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except Exception:
            pass
    if not soup_data:
        soup_data = BeautifulSoup('')

    # get a tags links and text; hrefs are only collected for anchors
    # that have visible text (a.string)
    anchor_data = get_anchor(soup_data)
    for a in anchor_data:
        if a.string:
            anchor_text += a.string.split()
            if a.has_attr('href'):
                anchor_link.append(a['href'])

    #run thru text analysis even with empty soup cause of the loops(fixed bug)
    if not anchor_link:
        anchor_link = ['']

    #extract text features
    anchor_text = ' '.join(anchor_text)
    text_result = text_analysis(anchor_text, var_name='anchor')

    #extract link features
    url_list = []
    for link in anchor_link:
        url_list.append(url_analysis(link, var_name='anchor'))
    url_result = sum_list_dict(url_list, 'anchor')

    # merge the two feature dicts; the original
    # dict(text_result.items() + url_result.items()) raises TypeError
    # on Python 3 (dict_items does not support `+`)
    anchor_dict = dict(text_result)
    anchor_dict.update(url_result)

    return anchor_dict
Ejemplo n.º 7
0
def get_html_feature(content_id=0, content_data='', html_data=''):
    """
    Extract tag-count style features from the full html document.

    status: required
    @param content_id: db id; when truthy, raw html is fetched via Content
        (note: this overwrites the local html_data parameter)
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @return: dict of html features, or {} when no soup could be built

    @TODO: image, video, etc..
    """
    html_dict = {}
    soup = ''

    # get data: priority is content_id, then pre-parsed soup, then raw html
    if content_id:
        html_data = Content.get_raw_html_by_id(content_id)
        try:
            soup = BeautifulSoup(html_data)
        except Exception:
            # best-effort parse; an unparsable page yields {} below
            # (narrowed from a bare `except:`)
            pass
    elif content_data:
        soup = content_data
    elif html_data:
        try:
            soup = BeautifulSoup(html_data)
        except Exception:
            pass
    if not soup:
        # unlike the optional features, this required feature bails out
        # with an empty dict instead of using an empty soup
        return {}

    #extract features
    html_dict['html_num'] = get_html_num(soup)
    html_dict['html_h'] = get_html_tags(soup, HTML_HEAD)
    html_dict['html_a'] = get_html_tags(soup, HTML_A)
    html_dict['html_p'] = get_html_tags(soup, HTML_P)
    html_dict['html_embed'] = get_html_tags(soup, HTML_EMBED)
    html_dict['html_style'] = get_html_tags(soup, HTML_STYLE)
    html_dict['html_layout'] = get_html_tags(soup, HTML_LAYOUT)
    html_dict['html_meta'] = get_html_tags(soup, HTML_META)
    html_dict['html_input'] = get_html_tags(soup, HTML_INPUT)
    html_dict['html_script'] = get_html_tags(soup, HTML_SCRIPT)

    return html_dict
Ejemplo n.º 8
0
def get_html_feature(content_id=0, content_data='', html_data=''):
    """
    Extract tag-count style features from the full html document.

    status: required
    @param content_id: db id; when truthy, raw html is fetched via Content
        (note: this overwrites the local html_data parameter)
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @return: dict of html features, or {} when no soup could be built

    @TODO: image, video, etc..
    """
    html_dict = {}
    soup = ''

    # get data: priority is content_id, then pre-parsed soup, then raw html
    if content_id:
        html_data = Content.get_raw_html_by_id(content_id)
        try:
            soup = BeautifulSoup(html_data)
        except Exception:
            # best-effort parse; an unparsable page yields {} below
            # (narrowed from a bare `except:`)
            pass
    elif content_data:
        soup = content_data
    elif html_data:
        try:
            soup = BeautifulSoup(html_data)
        except Exception:
            pass
    if not soup:
        # required feature: bail out with an empty dict rather than
        # computing features over an empty soup
        return {}

    #extract features
    html_dict['html_num'] = get_html_num(soup)
    html_dict['html_h'] = get_html_tags(soup, HTML_HEAD)
    html_dict['html_a'] = get_html_tags(soup, HTML_A)
    html_dict['html_p'] = get_html_tags(soup, HTML_P)
    html_dict['html_embed'] = get_html_tags(soup, HTML_EMBED)
    html_dict['html_style'] = get_html_tags(soup, HTML_STYLE)
    html_dict['html_layout'] = get_html_tags(soup, HTML_LAYOUT)
    html_dict['html_meta'] = get_html_tags(soup, HTML_META)
    html_dict['html_input'] = get_html_tags(soup, HTML_INPUT)
    html_dict['html_script'] = get_html_tags(soup, HTML_SCRIPT)

    return html_dict