Example #1
def get_data_reddit():
    for url in REDDIT_RSS:
        data = rss_data(url)
        for content in data:
            print 'Storing data from ' + url
            print 'TITLE:' + content['title'] + ' URL: ' + content['url']
            print '------------------------------------------------------'
            time.sleep(0.5)
            Content.create_or_update_content(db.session, **content)
        db.session.commit()
Example #2
File: rest.py Project: sysofwan/zapfeeds
def get_filtered_content(page):
    types = request.args.getlist("type")
    tags = request.args.getlist("tag")
    query = request.args.get("query")
    contents = []
    if tags:
        contents.extend([content.fp_serialize for content in Content.get_top_tag_filtered(page, tags)])
    elif types:
        contents.extend([content.fp_serialize for content in Content.get_top_type_filtered(page, types)])
    elif query:
        contents.extend([content.fp_serialize for content in Content.get_top_for_query(query, page)])
    return jsonify({"results": contents})
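The handler above only parses the query string and serializes results; how it is bound to a URL is not shown in the snippet. A minimal sketch of the kind of Flask registration such a handler implies, assuming a standard Flask app object (the URL rule and app name are illustrative, not taken from rest.py):

from flask import Flask

app = Flask(__name__)

# illustrative binding; the real URL rule in rest.py is not shown above
app.add_url_rule('/api/contents/filtered/<int:page>',
                 'get_filtered_content', get_filtered_content)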
Example #3
File: rest.py Project: sysofwan/zapfeeds
def get_top_content():
    valid_cookie = is_cookie_valid()
    history = get_history() if valid_cookie else []
    page_no = get_page_no() if valid_cookie else None
    if len(history) < 360:
        contents = [content.fp_serialize for content in Content.get_top_unviewed(history)]
    else:
        if not page_no:
            page_no = 1
        contents = [content.fp_serialize for content in Content.get_top_by_pages(page_no, history)]
    response = jsonify({"results": contents})
    if not valid_cookie:
        reset_cookie(response)
    set_cookie_time(response)
    return response
Example #4
def get_heading_feature(content_id=0, content_data='', html_data=''):
    """
    status: optional
    """
    soup_data = ''

    #get data
    if content_id:
        html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(html)
        except:
            pass
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except:
            pass
    if not soup_data:
        soup_data = BeautifulSoup('')

    #extract h tags and features
    heading_data = get_heading_word(soup_data)
    heading_dict = text_analysis(heading_data, var_name='heading')

    return heading_dict
Example #5
File: main.py Project: sysofwan/zapfeeds
def is_duplicate_content(feed):
    # TODO: if there are still duplicates, we can also check using the raw title and domain
    feed_id = get_feed_id(feed)
    content = Content.get_content_by_feed_id(feed_id)
    if content:
        return True
    return False
Example #6
def get_url_feature(content_id=0, content_data=''):
    """
    status: required
    @todo:
    """
    url = ''

    #get data
    if content_id:
        url = Content.get_content_by_id(content_id).url
    elif content_data:
        url = content_data
    if not url:
        return {}

    #extract text from url path features
    url_path = link_to_text(url)
    text_result = text_analysis(url_path, var_name='url', head_body='head')

    #extract url features
    url_result = url_analysis(url, var_name='url')

    #combine dict data
    url_dict = dict(text_result.items() +
                    url_result.items())

    return url_dict
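The final merge uses dict(text_result.items() + url_result.items()), which only works on Python 2, where items() returns lists; on Python 3 the items() views cannot be concatenated with +. Python 3 equivalents of the same merge, shown here only for clarity:

# Python 3 equivalents of dict(text_result.items() + url_result.items())
url_dict = {**text_result, **url_result}

# or, without unpacking syntax:
url_dict = dict(text_result)
url_dict.update(url_result)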
Example #7
def update_parent_cluster(clusters, contents, session):
    """
    """
    id_cluster = format_cluster(clusters)
    content_ids = [row[0] for row in contents]
    for content_id in content_ids:
        content = Content.get_content_by_id(content_id)
        if not content.parent_cluster:
            content.parent_cluster = id_cluster[content_id] if content_id in id_cluster else 0
            session.add(content)
        else:
            #update contents previously clustered
            if content_id in id_cluster:
                clustered_contents = Content.get_content_by_parent_cluster(content.parent_cluster)
                for clustered_content in clustered_contents:
                    clustered_content.parent_cluster = id_cluster[content_id]
                    session.add(clustered_content)
    session.commit()
Example #8
def rank_contents():
    logger.info('starting content ranking...')
    start_time = time.time()
    contents = Content.get_content_for_ranking(3)
    for content in contents:
        content.rank = rank_content(content)
        session.add(content)
    session.commit()
    logger.info('ranking completed in %s', str(timedelta(seconds=time.time() - start_time)))
Example #9
def requestRssData(url, google=False, newsvine=False, fark=False, force_refresh=False):
    content = feedparser.parse(url)
    data = []
    for i in content.entries:
        dictData = {}
        if not force_refresh and Content.get_content_by_url(i.link):
            print 'content in database, continue..'
            continue
        dictData['raw_url'] = i.link
        dictData['description'] = cleanSoupHtml(i.description).get_text()
        dictData['title'] = i.title
        try:
            dictData['timestamp'] = i.published_parsed
        except:
            continue
        #if the rss is from a google news feed, clean the url
        if google:
            url_primary = i.link.split('&url=')[1]
            url_secondary = ''
        elif newsvine:
            try:
                soup = BeautifulSoup(requests.get(i.link).text)
                url_primary = soup.find('span', {'class': 'c-seed-source'}).a['href']
                url_secondary = ''
            except:
                print 'problem opening newsvine link'
                continue
        elif fark:
            dictData['title'] = dictData['title'].split('[')[0]
            try:
                soup = BeautifulSoup(requests.get(i.id).text)
                url_temp = soup.find('a', {'class': 'outbound_link'})['href']
                url_primary = url_temp[url_temp.rfind('http'):]
                url_secondary = ''
            except:
                print 'problem opening fark link'
                continue
        else:
            try:
                url_primary = i.id
                url_secondary = i.link
            except AttributeError:
                url_primary = i.link
                url_secondary = ''
            except:
                continue

        #get the content if url is valid
        urlContentData = url_content(url_primary, url_secondary)
        if urlContentData:
            dictData = dict(dictData.items() + urlContentData.items())
        else:
            continue

        data.append(dictData)
        time.sleep(1)
    return data
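requestRssData relies on feedparser exposing title, link, description, id and published_parsed on each entry, and some of those attributes can be missing depending on the feed. A minimal, self-contained sketch of that feedparser access pattern (the feed URL is an example, not one of the project's sources):

import feedparser

feed = feedparser.parse('https://example.com/rss')  # example URL only
for entry in feed.entries:
    # attributes vary by feed, so guard each access with a default
    title = getattr(entry, 'title', '')
    link = getattr(entry, 'link', '')
    published = getattr(entry, 'published_parsed', None)  # time.struct_time or None
    print title + ' -> ' + link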
Example #10
def get_text(content_id):
    raw_html = Content.get_raw_html_by_id(content_id)
    try:
        text = Extractor(extractor='ArticleExtractor', html=raw_html).getText()
    except Exception as e:
        logger.exception('\nError extracting text from html. Exception: %s, %s',
                         e.__class__.__name__, e)
        return ''
    return text
Example #11
def get_anchor_feature(content_id=0, content_data='', html_data=''):
    """
    a tags in p tags
    @status: optional
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @todo:
    """
    soup_data = ''
    anchor_text = []
    anchor_link = []

    #get data
    if content_id:
        raw_html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(raw_html)
        except:
            pass
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except:
            pass
    if not soup_data:
        soup_data = BeautifulSoup('')

    #get a tags links and text
    anchor_data = get_anchor(soup_data)
    for a in anchor_data:
        if a.string:
            anchor_text += a.string.split()
            if a.has_attr('href'):
                anchor_link.append(a['href'])

    #run through text analysis even with an empty soup, because of the loops below (fixed bug)
    if not anchor_link:
        anchor_link = ['']

    #extract text features
    anchor_text = ' '.join(anchor_text)
    text_result = text_analysis(anchor_text, var_name='anchor')

    #extract link features
    url_list = []
    for link in anchor_link:
        url_list.append(url_analysis(link, var_name='anchor'))
    url_result = sum_list_dict(url_list, 'anchor')

    anchor_dict = dict(text_result.items() +
                       url_result.items())

    return anchor_dict
Example #12
def load_database(force_refresh=False):
    sources = ContentSource.query.all()
    for source in sources:
        url = source.url
        if 'google.com' in url:
            data = requestRssData(url, google=True, force_refresh=force_refresh)
        elif 'newsvine.com' in url:
            data = requestRssData(url, newsvine=True, force_refresh=force_refresh)
        elif 'feedsportal.com/c/35344/f' in url:
            data = requestRssData(url, fark=True, force_refresh=force_refresh)
        elif 'reddit.com' in url:
            data = rss_data(url)
        else:
            data = requestRssData(url, force_refresh=force_refresh)
        for content in data:
            content['source_id'] = source.id
            print 'Storing ' + content['url'] + ' from ' + url + ' ...'
            Content.create_or_update_content(db.session, **content)
        db.session.commit()
    print 'done loading database...'
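The if/elif chain keys the parser mode off substrings of each source URL. The same dispatch can be expressed as a lookup table, shown purely as an illustrative alternative (SOURCE_FLAGS and flags_for are hypothetical names, and the reddit.com branch that calls rss_data would still need its own check):

# hypothetical table mapping URL fragments to requestRssData keyword flags
SOURCE_FLAGS = (
    ('google.com', {'google': True}),
    ('newsvine.com', {'newsvine': True}),
    ('feedsportal.com/c/35344/f', {'fark': True}),
)

def flags_for(url):
    for fragment, flags in SOURCE_FLAGS:
        if fragment in url:
            return flags
    return {}

# usage inside load_database:
#     data = requestRssData(url, force_refresh=force_refresh, **flags_for(url))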
Example #13
def get_body():
    """
    parameters: html - extract text from html
    returns: <text><social shares>
    """
    result = []
    ids = [content.id for content in Content.query.all()]

    for content_id in ids:
        content_data = Content.get_content_by_id(content_id)
        result.append([html_to_text(content_data), content_data.real_shares])
    return result
Example #14
def get_description_feature(content_id=0, content_data=''):
    """
    status: optional
    @todo:
    """
    desc_data = ''

    #get data
    if content_id:
        desc_data = Content.get_content_by_id(content_id).description
        #use title data for analysis
        if not desc_data:
            desc_data = Content.get_content_by_id(content_id).title
    elif content_data:
        desc_data = content_data
    if not desc_data:
        desc_data = ''

    #extract feature
    desc_dict = text_analysis(desc_data, var_name='desc', head_body='head')

    return desc_dict
Example #15
def populate_real_shares():
    logger.info('starting real shares population...')
    start_time = time.time()
    contents = Content.get_content_no_real_shares_by_age(3)
    for content in contents:
        try:
            total_share = get_total_shares(content.url)
        except Exception:
            logger.exception('Unable to fetch real shares for content with id %s', content.id)
            continue
        content.real_shares = total_share
        session.add(content)
    session.commit()
    logger.info('real shares population completed in %s', str(timedelta(seconds=time.time() - start_time)))
Example #16
def populate_social_count():
    logger.info('starting social count population...')
    start_time = time.time()
    contents = Content.get_unranked_contents_by_age(1)
    for content in contents:
        try:
            social_share = get_social_share(content.url)
        except Exception:
            logger.exception('Unable to fetch social count for content with id: %s.',
                             content.id)
            continue
        social_share.content_id = content.id
        content.predicted_shares = predicted_shares(social_share, content)
        session.add(social_share)
        session.add(content)
    session.commit()
    logger.info('social count population completed in %s', str(timedelta(seconds=time.time() - start_time)))
Example #17
def populate_real_shares():
    logger.info('starting real shares population...')
    start_time = time.time()
    contents = Content.get_content_no_real_shares_by_age(3)
    for content in contents:
        try:
            total_share = get_total_shares(content.url)
        except Exception:
            logger.exception(
                'Unable to fetch real shares for content with id %s',
                content.id)
            continue
        content.real_shares = total_share
        session.add(content)
    session.commit()
    logger.info('real shares population completed in %s',
                str(timedelta(seconds=time.time() - start_time)))
Example #18
def get_icon_feature(content_id=0, content_data=''):
    """
    status: optional
    """
    data = ''

    #get data
    if content_id:
        data = Content.get_content_by_id(content_id).icon_url
    elif content_data:
        data = content_data

    #return result
    if data:
        return {'icon': 1}
    else:
        return {'icon': 0}
Example #19
def get_title_feature(content_id=0, content_data=''):
    """
    status: required
    """
    title = ''

    #get data
    if content_id:
        title = Content.get_content_by_id(content_id).title
    elif content_data:
        title = content_data
    if not title:
        return {}

    #extract text feature
    title_dict = text_analysis(title, var_name='title', head_body='head')

    return title_dict
Example #20
def cluster_content():
    """
    @processes:
    1.get data: new content + top n contents
    2.cluster
    3.determine parent cluster
    4.update parent cluster
    5.store results

    @todo:
    1.use better feature vectorizer (word2vec?)
    2.display the cluster
    """
    logger.info('starting clustering sequence...')
    start_time = time.time()
    contents = Content.get_content_for_clustering()
    clusters = cluster_news(contents, train=True)
    update_parent_cluster(clusters, contents, session)
    logger.info('clustering completed in %s', str(timedelta(seconds=time.time() - start_time)))
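rank_contents, cluster_content and the populate_* helpers in the other examples all follow the same timed-batch-plus-commit shape. A hedged sketch of chaining them on a fixed interval, assuming only the functions shown on this page (the job list and interval are arbitrary; the project's real scheduling is not shown):

import time

# illustrative maintenance loop only
JOBS = [populate_social_count, populate_real_shares, cluster_content, rank_contents]

def run_jobs_forever(pause_seconds=3600):
    while True:
        for job in JOBS:
            job()
        time.sleep(pause_seconds)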
Example #21
def populate_social_count():
    logger.info('starting social count population...')
    start_time = time.time()
    contents = Content.get_unranked_contents_by_age(1)
    for content in contents:
        try:
            social_share = get_social_share(content.url)
        except Exception:
            logger.exception(
                'Unable to fetch social count for content with id: %s.',
                content.id)
            continue
        social_share.content_id = content.id
        content.predicted_shares = predicted_shares(social_share, content)
        session.add(social_share)
        session.add(content)
    session.commit()
    logger.info('social count population completed in %s',
                str(timedelta(seconds=time.time() - start_time)))
Example #22
def get_data(news_number=1000):
    """
    todo:
    get text from db? html?
    """
    data = []
    counter = 0
    ids = id_from_database()
    for news_id in ids:
        if counter >= news_number:
            break
        content = Content.get_content_by_id(news_id)
        title = content.title
        description = content.description
        if not description:
            description = title
        data.append([news_id, title, description])
        counter += 1

    return data
Example #23
def cluster_content():
    """
    @processes:
    1.get data: new content + top n contents
    2.cluster
    3.determine parent cluster
    4.update parent cluster
    5.store results

    @todo:
    1.use better feature vectorizer (word2vec?)
    2.display the cluster
    """
    logger.info('starting clustering sequence...')
    start_time = time.time()
    contents = Content.get_content_for_clustering()
    clusters = cluster_news(contents, train=True)
    update_parent_cluster(clusters, contents, session)
    logger.info('clustering completed in %s',
                str(timedelta(seconds=time.time() - start_time)))
Example #24
def get_html_feature(content_id=0, content_data='', html_data=''):
    """
    status: required
    @param content_data: BeautifulSoup object
    @param html_data: html string
    
    @TODO: image, video, etc..
    """
    html_dict = {}
    soup = ''

    #get data
    if content_id:
        html_data = Content.get_raw_html_by_id(content_id)
        try:
            soup = BeautifulSoup(html_data)
        except:
            pass
    elif content_data:
        soup = content_data
    elif html_data:
        try:
            soup = BeautifulSoup(html_data)
        except:
            pass
    if not soup:
        return {}

    #extract features
    html_dict['html_num'] = get_html_num(soup)
    html_dict['html_h'] = get_html_tags(soup, HTML_HEAD)
    html_dict['html_a'] = get_html_tags(soup, HTML_A)
    html_dict['html_p'] = get_html_tags(soup, HTML_P)
    html_dict['html_embed'] = get_html_tags(soup, HTML_EMBED)
    html_dict['html_style'] = get_html_tags(soup, HTML_STYLE)
    html_dict['html_layout'] = get_html_tags(soup, HTML_LAYOUT)
    html_dict['html_meta'] = get_html_tags(soup, HTML_META)
    html_dict['html_input'] = get_html_tags(soup, HTML_INPUT)
    html_dict['html_script'] = get_html_tags(soup, HTML_SCRIPT)

    return html_dict
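get_html_num and get_html_tags are project helpers that are not shown here, and HTML_HEAD, HTML_A, etc. appear to be lists of tag names. A minimal sketch of what a tag-counting helper of that shape could look like with bs4 (an assumption about its behavior, not the project's actual implementation):

def count_tags(soup, tag_names):
    # count occurrences of any of the given tag names in the parsed document
    return len(soup.find_all(tag_names))

# example: count_tags(soup, ['h1', 'h2', 'h3'])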
Example #25
def get_timestamp_feature(content_id=0, content_data=''):
    """
    status: required
    @todo:
    """
    date_dict = {}
    time_data = ''

    #get data
    if content_id:
        time_data = Content.get_content_by_id(content_id).timestamp
    elif content_data:
        time_data = content_data
    if not time_data:
        return {}

    #extract feature
    date_dict['timestamp_day'] = day_published(time_data)
    date_dict['timestamp_hour'] = hour_published(time_data)

    return date_dict
Example #26
def get_content_type_feature(content_id=0, content_data=''):
    """
    status: optional
    """
    data = 0

    #get data
    if content_id:
        data_temp = Content.get_content_by_id(content_id).type_id
        if data_temp:
            if data_temp.isdigit():
                data = int(data_temp)
    elif content_data:
        data = content_data

    if not data:
        content_type = 0
    else:
        content_type = data.id

    return {'content_type': content_type}
Example #27
File: main.py Project: sysofwan/zapfeeds
def is_duplicate_url(self):
    content = Content.get_content_by_link(self.url)
    if content:
        return True
    return False
Example #28
File: main.py Project: sysofwan/zapfeeds
def generate_content(content_data, source_id, session):
    content = Content()
    content.url = content_data.url
    content.feed_id = get_feed_id(content_data.feed)
    content.title = content_data.title
    content.description = content_data.description
    content.image_url = content_data.image_url
    content.icon_url = content_data.icon_url
    content.timestamp = content_data.timestamp
    content.content_source_id = source_id

    content.tags = get_tags(content_data, session)
    content.site_name = get_site_name(content_data.soup, session)
    content.type = content_data.type

    features = Extract(content_data)
    content.feature_extraction = features.get_feature(convert_string=True)

    return content