def sample():
    """Draw a balanced sample of valid blogs and write it to two files.

    Loads the blogs from ``output/umtc.txt`` and collects up to ``target / 2``
    blogs accepted by ``blogger.is_valid`` with its default arguments, plus up
    to ``target / 2`` blogs that are only accepted with ``check_emo=False``.
    Each group is written, one ``repr`` per line, to
    ``output/umtc_yes_emo.txt`` and ``output/umtc_no_emo.txt`` respectively.
    Finally the first file is loaded again and the number of entries printed.
    """
    import sys

    blogs = commdatica.load('output/umtc.txt')
    target = 1000
    quota = target // 2  # each group receives half of the overall target
    has_emo = []
    no_emo = []

    pbar = progbar.start(target)
    for blog in blogs:
        if blogger.is_valid(blog.text):
            # A blog accepted by the default check never falls through to the
            # no-emo group, even when the emo group is already full.
            if len(has_emo) < quota:
                has_emo.append(blog)
        elif blogger.is_valid(blog.text, check_emo=False):
            if len(no_emo) < quota:
                no_emo.append(blog)
        collected = len(has_emo) + len(no_emo)
        pbar.update(collected)
        if collected >= target:
            # Both groups are full; validating the remaining blogs is wasted work.
            break
    pbar.finish()

    for name, group in (('umtc_yes_emo.txt', has_emo),
                        ('umtc_no_emo.txt', no_emo)):
        # Status text and 'OK' share one line; flush so it shows before the write.
        sys.stdout.write('writing to %s .... ' % name)
        sys.stdout.flush()
        # Close the file explicitly so the content is on disk before it is
        # read back below.
        with open('output/' + name, 'w') as out:
            out.write('\n'.join([repr(blog) for blog in group]))
        print('OK')

    bs = commdatica.load('output/umtc_yes_emo.txt')
    print(len(bs))
def sample():
    """Draw a balanced sample of valid blogs and write it to two files.

    Loads the blogs from ``output/umtc.txt`` and collects up to ``target / 2``
    blogs accepted by ``blogger.is_valid`` with its default arguments, plus up
    to ``target / 2`` blogs that are only accepted with ``check_emo=False``.
    Each group is written, one ``repr`` per line, to
    ``output/umtc_yes_emo.txt`` and ``output/umtc_no_emo.txt`` respectively.
    Finally the first file is loaded again and the number of entries printed.
    """
    import sys

    blogs = commdatica.load("output/umtc.txt")
    target = 1000
    quota = target // 2  # each group receives half of the overall target
    has_emo = []
    no_emo = []

    pbar = progbar.start(target)
    for blog in blogs:
        if blogger.is_valid(blog.text):
            # A blog accepted by the default check never falls through to the
            # no-emo group, even when the emo group is already full.
            if len(has_emo) < quota:
                has_emo.append(blog)
        elif blogger.is_valid(blog.text, check_emo=False):
            if len(no_emo) < quota:
                no_emo.append(blog)
        collected = len(has_emo) + len(no_emo)
        pbar.update(collected)
        if collected >= target:
            # Both groups are full; validating the remaining blogs is wasted work.
            break
    pbar.finish()

    outputs = (
        ("umtc_yes_emo.txt", has_emo),
        ("umtc_no_emo.txt", no_emo),
    )
    for name, group in outputs:
        # Status text and "OK" share one line; flush so it shows before the write.
        sys.stdout.write("writing to %s .... " % name)
        sys.stdout.flush()
        # Close the file explicitly so the content is on disk before it is
        # read back below.
        with open("output/" + name, "w") as out:
            out.write("\n".join([repr(blog) for blog in group]))
        print("OK")

    bs = commdatica.load("output/umtc_yes_emo.txt")
    print(len(bs))
def main():
    """Parse the command line and hand the pending blogs to ``launch``.

    Options:
        -i/--input     file passed to ``commdatica.load`` (required)
        -o/--output    output file handed to ``launch`` (required)
        -a/--account   inclusive index range ``start_idx,end_idx`` into the
                       list returned by ``weiboparser.load_accounts``
                       (required, no spaces)
        -n/--instance  integer forwarded to ``launch`` (default 5)
        -r/--restart   ignore earlier downloads and use file mode 'w'
                       instead of 'a'
        -t/--interval  integer forwarded to ``launch`` (default 3)

    When a required option is missing or malformed, a message is printed and
    the function returns without launching anything.
    """
    # get parameters from terminal
    optparser = OptionParser()
    optparser.add_option('-i', '--input', action='store', type='string', dest='infile')
    optparser.add_option('-o', '--output', action='store', type='string', dest='outfile')
    optparser.add_option('-a', '--account', action='store', type='string', dest='acc_range')
    optparser.add_option('-n', '--instance', action='store', type='int', dest='n_instance', default=5)
    optparser.add_option('-r', '--restart', action='store_true', dest='restart', default=False)
    optparser.add_option('-t', '--interval', action='store', type='int', dest='interval', default=3)
    opts, _ = optparser.parse_args()

    if not opts.infile:
        print('-i infile not specified')
        return
    if not opts.outfile:
        print('-o outfile not specified')
        return
    if not opts.acc_range:
        print('-a (start_idx,end_idx) not specified')
        return

    # Raw string keeps \d a regex escape; the trailing $ rejects input such as
    # "1,2junk" that a bare re.match would otherwise accept.
    m = re.match(r'(\d+),(\d+)$', opts.acc_range)
    if not m:
        print('-a start_idx,end_idx should contain no space')
        return
    start_idx, end_idx = int(m.group(1)), int(m.group(2))

    ftype = 'w' if opts.restart else 'a'

    # prepare the accounts (end_idx is inclusive)
    all_accounts = weiboparser.load_accounts()
    accounts = all_accounts[start_idx:end_idx + 1]

    # prepare the blog infos
    all_bloginfo = commdatica.load(opts.infile)

    # filter the blogs whose comments have been downloaded
    if opts.restart:
        mids = set()
    else:
        mids = set(downloaded_mids(opts.outfile))
    # NOTE(review): assumed to be logged for both branches -- confirm.
    logger.info('%d downloaded in %s' % (len(mids), opts.outfile))

    bloginfos = [bloginfo for bloginfo in all_bloginfo if bloginfo.mid not in mids]

    launch(opts.outfile, accounts, bloginfos, ftype, opts.n_instance, opts.interval)
def test():
    """Run ``launch`` on a small slice of accounts and not-yet-downloaded blogs.

    Uses the first 25 loaded accounts and the first 8 blog infos whose ``mid``
    is not reported by ``downloaded_mids()``.
    """
    accounts = weiboparser.load_accounts()[:25]
    every_bloginfo = commdatica.load()

    # do not download comments for the same blog again
    seen_mids = set(downloaded_mids())
    pending = []
    for info in every_bloginfo:
        if info.mid not in seen_mids:
            pending.append(info)

    # NOTE(review): the CLI entry point calls launch with six arguments
    # (outfile, accounts, bloginfos, ftype, n_instance, interval); here 4 lands
    # in the fourth positional slot -- confirm against launch's signature.
    launch(JSONS_COMMENT, accounts, pending[:8], 4)
def main():
    """Report the share of blogs that are valid without the emo check.

    Loads the blogs from ``output/umtc.txt``, prints how many were loaded and
    then the percentage for which
    ``blogger.is_valid(blog.text, check_emo=False)`` is truthy. A progress bar
    is advanced while scanning. When no blogs were loaded only the total is
    printed.
    """
    blogs = commdatica.load('output/umtc.txt')
    total = len(blogs)
    print('%d in total' % total)
    if not total:
        # Nothing to scan; the percentage below would divide by zero.
        return

    pbar = progbar.start(total)
    valid = 0
    for done, blog in enumerate(blogs, 1):
        if blogger.is_valid(blog.text, check_emo=False):
            valid += 1
        pbar.update(done)
    pbar.finish()

    print('%.2f%%' % (100. * valid / total))