コード例 #1
0
def invite():
    """Prepare the invitation context for each selected profile and email it.

    For every profile returned by the selection query this:
      1. gathers the posts of each of the profile's blogs via partial_result,
      2. strips HTML from every post and counts words per blog,
      3. records the profile in invalid_profiles (and skips it) when its
         wordiest blog has fewer than MIN_NUM_WORD words,
      4. otherwise builds the email context (word cloud, token, surveys
         taken), computes the personality score and calls send_email.

    The database handle is always closed on exit, including on errors.
    """
    dbh1 = BlogsDB.BlogsDB_Handler()  # used to get all the valid profiles and necessary info

    # Intended selection: profiles that
    #   1) have an email
    #   2) have not been considered invalid (too few words)
    #   3) have not been sent an email
    production_cmd = '''
          select p.url, p.email, pb.blog_url, blogs.name 
          from profiles as p, profiles_blogs as pb, blogs 
          where p.url = pb.profile_url 
          and pb.blog_url = blogs.url
          and p.email is not null
          and p.url not in (select url from invalid_profiles) 
          and p.url not in (select profile_url from profiles_tokens)
          limit 5;
          '''
    # Single hard-coded profile used for testing.
    debug_cmd = '''
          select p.url, p.email, pb.blog_url, blogs.name 
          from profiles as p, profiles_blogs as pb, blogs 
          where p.url = pb.profile_url 
          and pb.blog_url = blogs.url
          and p.url = 'http://www.blogger.com/profile/00420783423172718178';
          '''
    # NOTE(review): the debug query has always overridden the production one
    # here (the original assigned `cmd` twice). That behavior is preserved;
    # switch to `production_cmd` once real invitations should go out.
    cmd = debug_cmd

    try:
        # profile_url -> {'email': ..., 'name': ..., 'blog_posts': {blog_url: posts}}
        profiles_detail = {}

        for profile_url, email, blog_url, blog_name in dbh1.exec_and_get(stmt=cmd, params=[]):
            detail = profiles_detail.setdefault(profile_url, {'blog_posts': {}})
            # A profile with several blogs yields several rows; the last row
            # wins for 'email' and 'name', as before.
            detail['email'] = email
            detail['name'] = blog_name
            detail['blog_posts'][blog_url] = partial_result(blog_url)

        for profile_url, detail in profiles_detail.items():
            longest_blog = []     # the blog of the most words of the user
            max_words = -1        # number of words in longest_blog
            all_posts = []        # every post of the profile, across all blogs

            for blog_posts in detail['blog_posts'].values():
                num_words = 0       # number of words in this blog
                for post in blog_posts:
                    # parse the post in case of raw html format; the cleaned
                    # text replaces the stored content in place
                    post['content'] = BeautifulSoup(post['content']).get_text()
                    num_words += len(post['content'].split())
                    all_posts.append(post)

                if num_words > max_words:
                    max_words = num_words
                    longest_blog = blog_posts

            if max_words < MIN_NUM_WORD:
                # Remember the profile so the production query skips it later.
                stmt = 'insert into invalid_profiles values(%s);'
                dbh1.exec_stmt(stmt, [profile_url])
                # Parenthesised single argument prints identically on Python 2 and 3.
                print('This profile has too few words')
                continue

            ctx = {'blog_name': detail['name'].replace("\'", '')}
            ctx['word_cloud'] = visualize.word_cloud(longest_blog)
            ctx['id'] = get_token(profile_url)
            ctx['surveys_taken'] = ', '.join(qualtrics_get.surveys_taken(profile_url))

            # compute the big_5 score before sending email
            visualize.get_personality(profile_url, all_posts, dbh1)

            # NOTE(review): the recipient is a fixed address, not the
            # profile's own detail['email'] collected above -- confirm
            # before enabling the production query.
            send_email('*****@*****.**', ctx)
    finally:
        # Release the handle even when a query or a visualization step fails.
        dbh1.close()
コード例 #2
0
def search_blog_by_link(request):
    """Render the analysis page for the blog given in the ``link`` GET parameter.

    GET parameters:
        link      -- URL of the blog to analyse (required).
        num_posts -- how many of the most recent posts to analyse; must be a
                     positive integer and is capped at 200.

    Returns a plain-text HttpResponse describing the problem when a parameter
    is missing or invalid, when the blog cannot be resolved, or when the blog
    has no posts. Otherwise returns the rendered 'blog_search_result.html'.

    Side effects: stores the fetched profile/blog/new posts through the DB
    handler, and starts a background process to fetch the remaining posts
    when get.get_blog_by_link reports a next page token. The DB handle is
    closed on every exit path.
    """
    if 'link' not in request.GET:
        return HttpResponse('Please input a url to the blog')

    # Validate before touching the database: a missing or non-numeric value
    # used to raise, and a value below 1 made the `posts[-mask:]` slices
    # select the wrong posts (`posts[-0:]` is the whole list).
    try:
        max_to_display = int(request.GET['num_posts'])
    except (KeyError, ValueError):
        return HttpResponse('Please input a valid number of posts to display')
    if max_to_display < 1:
        return HttpResponse('Please input a valid number of posts to display')
    max_to_display = min(max_to_display, 200)

    blog_link = request.GET['link']

    dbh = BlogsDB.BlogsDB_Handler()
    try:
        posts = dbh.get_posts_in_blog(blog_link)

        # Publication marker of the newest stored post; -1 when nothing is stored yet.
        latest = posts[-1]['published'] if posts else -1

        profile, blog, new_posts, next_page_token = get.get_blog_by_link(
            blog_link, latest, max_to_display)

        if blog is None:
            return HttpResponse('Please input a valid Blogger url')

        if 'image_url' not in profile:
            # save profile and its blogs when failing to scrape it right now
            save_profile(profile['url'], blog['url'])

        posts.extend(new_posts)

        if not posts:
            return HttpResponse('Oops. Seems like you have published nothing in this blog')

        # Number of trailing posts to analyse; always >= 1 at this point.
        mask = min(max_to_display, len(posts))

        ctx = {'blog_name': blog['name'].replace("\'", '')}

        # visualization
        # A fresh slice is passed to every call, as before, so no helper can
        # affect the list another helper receives.
        for group_by in ('month', 'year', 'week', 'day'):
            ctx['wf_vs_' + group_by] = visualize.words_vs_time(
                posts=posts[-mask:], freq_words=[], group_by=group_by)

        ctx['personality_url'] = visualize.get_personality(
            posts[0]['author_url'], posts[-mask:], dbh)
        ctx['word_cloud'] = visualize.word_cloud(posts[-mask:])
        ctx['le_classes'] = visualize.ling_ethnography(posts[-mask:])
        ctx['ngram_model'] = visualize.ngram_model(posts[-mask:])

        # update the database
        dbh.batch_update(profile, blog, new_posts)

        # spawn a parallel process to retrieve the remaining posts
        if next_page_token:
            proc = mp.Process(
                target=get.get_remain_posts,
                args=(blog_link, blog['id'], next_page_token,
                      get.MAX_TO_DISPLAY - len(posts), latest))
            proc.start()
    finally:
        # Close the handle on early returns and on errors too.
        dbh.close()

    # render() is given the plain dict: newer Django versions reject a
    # Context instance here, and a dict is accepted by old versions as well.
    return render(request, 'blog_search_result.html', ctx)