# Reconstructed module imports. BlogsDB, get, qualtrics_get, and visualize are
# project-local modules, so these exact import paths are assumptions;
# partial_result, save_profile, and send_email are helpers defined elsewhere
# in the project.
import multiprocessing as mp

from bs4 import BeautifulSoup
from django.http import HttpResponse
from django.shortcuts import render

import BlogsDB
import get
import qualtrics_get
import visualize

MIN_NUM_WORD = 500  # placeholder; the original threshold is not shown in this excerpt


def invite():
    # Used to get all the valid profiles and the necessary info.
    dbh1 = BlogsDB.BlogsDB_Handler()

    # Select profiles that
    #   1) have an email,
    #   2) have not been marked invalid (too few words), and
    #   3) have not been sent an email yet.
    cmd = '''
        select p.url, p.email, pb.blog_url, blogs.name
        from profiles as p, profiles_blogs as pb, blogs
        where p.url = pb.profile_url
          and pb.blog_url = blogs.url
          and p.email is not null
          and p.url not in (select url from invalid_profiles)
          and p.url not in (select profile_url from profiles_tokens)
        limit 5;
    '''
    # Debugging override: pin the query to a single hard-coded profile.
    # Remove this reassignment to invite the five profiles selected above.
    cmd = '''
        select p.url, p.email, pb.blog_url, blogs.name
        from profiles as p, profiles_blogs as pb, blogs
        where p.url = pb.profile_url
          and pb.blog_url = blogs.url
          and p.url = 'http://www.blogger.com/profile/00420783423172718178';
    '''
    # ipdb.set_trace()

    # Group the rows by profile: email, blog name, and each blog's posts.
    profiles_detail = {}
    for profile_url, email, blog_url, blog_name in dbh1.exec_and_get(stmt=cmd, params=[]):
        detail = profiles_detail.setdefault(profile_url, {})
        detail['email'] = email
        detail['name'] = blog_name
        detail.setdefault('blog_posts', {})[blog_url] = partial_result(blog_url)

    for profile_url in profiles_detail:
        longest_blog = []   # the user's blog with the most words
        max_words = -1      # number of words in longest_blog
        all_posts = []
        for blog_url in profiles_detail[profile_url]['blog_posts']:
            num_words = 0   # number of words in this blog
            for post in profiles_detail[profile_url]['blog_posts'][blog_url]:
                txt = post['content']
                # Strip tags in case the post is raw HTML; the explicit parser
                # avoids bs4's "no parser specified" warning.
                soup = BeautifulSoup(txt, 'html.parser')
                post['content'] = soup.get_text()
                num_words += len(post['content'].split())
                all_posts.append(post)
            if num_words > max_words:
                max_words = num_words
                longest_blog = profiles_detail[profile_url]['blog_posts'][blog_url]

        # Skip (and remember) profiles whose longest blog is still too short.
        if max_words < MIN_NUM_WORD:
            stmt = 'insert into invalid_profiles values(%s);'
            dbh1.exec_stmt(stmt, [profile_url])
            print('This profile has too few words')
            continue

        ctx = {'blog_name': profiles_detail[profile_url]['name'].replace("'", '')}
        # ctx['wf_vs_month'] = visualize.words_vs_time(posts=longest_blog, freq_words=[], group_by='month')
        # ctx['wf_vs_year'] = visualize.words_vs_time(posts=longest_blog, freq_words=[], group_by='year')
        # ctx['wf_vs_week'] = visualize.words_vs_time(posts=longest_blog, freq_words=[], group_by='week')
        # ctx['wf_vs_day'] = visualize.words_vs_time(posts=longest_blog, freq_words=[], group_by='day')
        ctx['word_cloud'] = visualize.word_cloud(longest_blog)
        token = get_token(profile_url)
        ctx['id'] = token
        ctx['surveys_taken'] = ', '.join(qualtrics_get.surveys_taken(profile_url))
        # Compute the Big Five score before sending the email.
        visualize.get_personality(profile_url, all_posts, dbh1)
        send_email('*****@*****.**', ctx)
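
# --- Illustrative sketch, not part of the original module. ---
# The selection query in invite() skips profiles already listed in
# profiles_tokens, which implies get_token() both mints a token and records it
# so each profile is invited at most once. A minimal version might look like
# the following; the uuid-based token format and the assumed schema
# profiles_tokens(profile_url, token) are guesses, and the real helper lives
# elsewhere in the project.
import uuid


def get_token(profile_url):
    """Hypothetical helper: mint a unique invite token and persist it."""
    token = uuid.uuid4().hex  # assumed token format
    dbh = BlogsDB.BlogsDB_Handler()
    dbh.exec_stmt('insert into profiles_tokens values(%s, %s);',
                  [profile_url, token])
    dbh.close()
    return token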
def search_blog_by_link(request):
    if 'link' not in request.GET:
        return HttpResponse('Please input a url to the blog')
    dbh = BlogsDB.BlogsDB_Handler()
    blog_link = request.GET['link']
    # Cap the number of posts rendered; fall back to the cap when num_posts is
    # absent (assumption: the original form always supplied it).
    max_to_display = min(int(request.GET.get('num_posts', 200)), 200)
    posts = dbh.get_posts_in_blog(blog_link)
    # ipdb.set_trace()

    # Timestamp of the newest cached post, so only newer posts are fetched;
    # -1 means nothing is cached yet.
    if len(posts) == 0:
        latest = -1
    else:
        latest = posts[-1]['published']
    profile, blog, new_posts, next_page_token = get.get_blog_by_link(
        blog_link, latest, max_to_display)
    # assert 1 == 2
    if blog is None:
        return HttpResponse('Please input a valid Blogger url')
    if 'image_url' not in profile:
        # Save the profile and its blog for later when it could not be fully
        # scraped right now.
        save_profile(profile['url'], blog['url'])
    posts.extend(new_posts)
    if len(posts) == 0:
        return HttpResponse('Oops. Seems like you have published nothing in this blog')
    mask = min(len(posts), max_to_display)  # render at most the newest max_to_display posts

    ctx = {'blog_name': blog['name'].replace("'", '')}
    # Visualizations over the newest `mask` posts.
    ctx['wf_vs_month'] = visualize.words_vs_time(posts=posts[-mask:], freq_words=[], group_by='month')
    ctx['wf_vs_year'] = visualize.words_vs_time(posts=posts[-mask:], freq_words=[], group_by='year')
    ctx['wf_vs_week'] = visualize.words_vs_time(posts=posts[-mask:], freq_words=[], group_by='week')
    ctx['wf_vs_day'] = visualize.words_vs_time(posts=posts[-mask:], freq_words=[], group_by='day')
    # with open(dirname + '/debug.txt', 'w') as f:
    #     f.write(posts[0]['author']['url'])
    personality_url = visualize.get_personality(posts[0]['author_url'], posts[-mask:], dbh)
    ctx['personality_url'] = personality_url
    ctx['word_cloud'] = visualize.word_cloud(posts[-mask:])
    ctx['le_classes'] = visualize.ling_ethnography(posts[-mask:])
    ctx['ngram_model'] = visualize.ngram_model(posts[-mask:])

    # Update the database with whatever was just scraped.
    dbh.batch_update(profile, blog, new_posts)

    # Spawn a parallel process to retrieve the remaining posts.
    if next_page_token:
        proc = mp.Process(target=get.get_remain_posts,
                          args=(blog_link, blog['id'], next_page_token,
                                get.MAX_TO_DISPLAY - len(posts), latest))
        proc.start()

    dbh.close()
    # render() takes a plain dict; wrapping it in django.template.Context, as
    # an earlier revision did, raises on modern Django.
    return render(request, 'blog_search_result.html', ctx)
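
# --- Hedged usage sketch, not part of the original module. ---
# How the view might be wired into a Django 1.x urlconf; the route path and
# the "blogs.views" module layout are assumptions, and only
# search_blog_by_link comes from the code above. The view would then be
# reachable as e.g. /search_by_link/?link=<blog-url>&num_posts=50.
#
#     from django.conf.urls import url
#
#     from blogs import views
#
#     urlpatterns = [
#         url(r'^search_by_link/$', views.search_blog_by_link),
#     ]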