def get_heading_feature(content_id=0, content_data='', html_data=''):
    """Extract text features from the heading (h*) tags of a document.

    status: optional

    @param content_id: if truthy, fetch the raw html for this content id
        and parse it (takes precedence over the other two arguments)
    @param content_data: an already-parsed BeautifulSoup object
    @param html_data: a raw html string to parse
    @return: dict of heading text features produced by text_analysis()
    """
    soup_data = None
    # Resolve the input source, in priority order: id -> soup -> raw html.
    if content_id:
        html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(html)
        except Exception:
            # Best-effort: fall through to the empty-soup fallback below.
            logger.exception('Failed to parse html for content_id %s', content_id)
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except Exception:
            logger.exception('Failed to parse supplied html_data')
    if not soup_data:
        # Empty soup keeps the feature extraction below from crashing and
        # yields zero-valued features instead.
        soup_data = BeautifulSoup('')
    # Extract h-tag words and turn them into text features.
    heading_data = get_heading_word(soup_data)
    heading_dict = text_analysis(heading_data, var_name='heading')
    return heading_dict
def get_text(content_id):
    """Extract the main article text for a stored content item.

    Fetches the raw html by id and runs it through boilerpipe's
    ArticleExtractor.

    @param content_id: id of the content row to fetch raw html for
    @return: extracted article text, or '' if extraction fails
    """
    raw_html = Content.get_raw_html_by_id(content_id)
    try:
        text = Extractor(extractor='ArticleExtractor', html=raw_html).getText()
    except Exception as e:
        # logger.exception records the traceback; return empty text so
        # callers can proceed with a degraded (text-less) feature set.
        logger.exception(
            '\nError extracting text from html. Exception: %s, %s',
            e.__class__.__name__, e)
        return ''
    return text
def get_anchor_feature(content_id=0, content_data='', html_data=''):
    """Extract features from anchor (a) tags in p tags.

    @status: optional
    @param content_id: if truthy, fetch the raw html for this content id
        and parse it (takes precedence over the other two arguments)
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @return: dict combining anchor-text features (text_analysis) and
        anchor-href features (url_analysis summed over all links)
    @todo:
    """
    soup_data = None
    anchor_text = []
    anchor_link = []
    # Resolve the input source, in priority order: id -> soup -> raw html.
    if content_id:
        raw_html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(raw_html)
        except Exception:
            # Best-effort: fall through to the empty-soup fallback below.
            logger.exception('Failed to parse html for content_id %s', content_id)
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except Exception:
            logger.exception('Failed to parse supplied html_data')
    if not soup_data:
        soup_data = BeautifulSoup('')
    # Collect anchor text tokens and hrefs.
    anchor_data = get_anchor(soup_data)
    for a in anchor_data:
        if a.string:
            anchor_text += a.string.split()
        if a.has_attr('href'):
            anchor_link.append(a['href'])
    # Run thru url analysis even with an empty soup so the feature keys
    # are always emitted (fixed bug).
    if not anchor_link:
        anchor_link = ['']
    # Extract text features.
    anchor_text = ' '.join(anchor_text)
    text_result = text_analysis(anchor_text, var_name='anchor')
    # Extract link features, one dict per link, then sum them.
    url_list = []
    for link in anchor_link:
        url_list.append(url_analysis(link, var_name='anchor'))
    url_result = sum_list_dict(url_list, 'anchor')
    # Merge via update(): dict(a.items() + b.items()) is Python-2-only
    # (dict_items does not support + on Python 3).
    anchor_dict = dict(text_result)
    anchor_dict.update(url_result)
    return anchor_dict
def get_html_feature(content_id=0, content_data='', html_data=''):
    """Extract structural html-tag-count features from a document.

    status: required

    @param content_id: if truthy, fetch the raw html for this content id
        and parse it (takes precedence over the other two arguments)
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @return: dict of tag-count features, or {} if no soup could be built
    @TODO: image, video, etc..
    """
    html_dict = {}
    soup = None
    # Resolve the input source, in priority order: id -> soup -> raw html.
    # NOTE: use a local for the fetched html instead of clobbering the
    # html_data parameter.
    if content_id:
        raw_html = Content.get_raw_html_by_id(content_id)
        try:
            soup = BeautifulSoup(raw_html)
        except Exception:
            logger.exception('Failed to parse html for content_id %s', content_id)
    elif content_data:
        soup = content_data
    elif html_data:
        try:
            soup = BeautifulSoup(html_data)
        except Exception:
            logger.exception('Failed to parse supplied html_data')
    if not soup:
        # Unlike the optional extractors, this one signals failure with an
        # empty dict rather than zero-valued features.
        return {}
    # Extract per-tag-group counts.
    html_dict['html_num'] = get_html_num(soup)
    html_dict['html_h'] = get_html_tags(soup, HTML_HEAD)
    html_dict['html_a'] = get_html_tags(soup, HTML_A)
    html_dict['html_p'] = get_html_tags(soup, HTML_P)
    html_dict['html_embed'] = get_html_tags(soup, HTML_EMBED)
    html_dict['html_style'] = get_html_tags(soup, HTML_STYLE)
    html_dict['html_layout'] = get_html_tags(soup, HTML_LAYOUT)
    html_dict['html_meta'] = get_html_tags(soup, HTML_META)
    html_dict['html_input'] = get_html_tags(soup, HTML_INPUT)
    html_dict['html_script'] = get_html_tags(soup, HTML_SCRIPT)
    return html_dict