# -*- coding: utf-8 -*-
import re
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 API (findAll)

# htmlentity2unicode, brRemove and ScrapeDatetime are helper functions
# defined elsewhere in this module.


def ScrapeContents(body):
    """Extract title, sender, change info and full text from a notice page."""
    dic = {
        u"title": u"",
        u"sender": u"",
        u"info": u"",
        u"all_text": u"",
    }
    soup = BeautifulSoup(body)
    q1 = soup.findAll('span', id="form1:htmlTitle")
    q2 = soup.findAll('span', id="form1:htmlFrom")
    q3 = soup.findAll('span', id="form1:htmlMain")
    q4 = soup.findAll('span', id="form1:htmlHenko")

    # q1: notice title
    if q1 != []:
        pstr = htmlentity2unicode(q1[0].string)
        pstr = pstr.replace(' ', '').replace(u'\u3000', '')  # strip half- and full-width spaces
        dic[u"title"] = pstr
        dic[u"all_text"] = dic[u"all_text"] + pstr
        print pstr  # .encode('utf-8')
    else:
        dic[u"title"] = u''

    # q2: sender
    if q2 != []:
        pstr = htmlentity2unicode(q2[0].string)
        pstr = pstr.replace(' ', '').replace(u'\u3000', '')
        dic[u"sender"] = pstr
        dic[u"all_text"] = dic[u"all_text"] + pstr
    else:
        dic[u"sender"] = u''

    # q3: main body, parsed field by field
    for a in q3:
        a = brRemove(a)
        dic2 = ScrapeMainHTML(a.contents)
        for key in dic2.keys():
            dic[key] = dic2[key]
        dic[u"all_text"] = dic[u"all_text"] + a.text

    # q4: change information
    if q4 != []:
        pstr = htmlentity2unicode(q4[0].string)
        pstr = pstr.replace(' ', '').replace(u'\u3000', '')
        dic[u"info"] = pstr
        dic[u"all_text"] = dic[u"all_text"] + pstr
    else:
        dic[u"info"] = u''

    return dic
def ScrapeMainHTML(contents):
    """Parse the labelled fields of a notice body (field labels are Japanese)."""
    p1 = re.compile(u"^科目名?:?(?P<target>.*)")    # course name
    p2 = re.compile(u"^休講日時?:?(?P<target>.*)")  # cancelled date/time
    p3 = re.compile(u"^補講日時?:?(?P<target>.*)")  # make-up class date/time
    p4 = re.compile(u"^日[時付]?:?(?P<target>.*)")  # date
    p5 = re.compile(u"^教員名?:?(?P<target>.*)")    # teacher name
    p6 = re.compile(u"^時限:?(?P<target>.*)")       # class period
    p7 = re.compile(u"^補講教室:?(?P<target>.*)")   # make-up classroom
    p8 = re.compile(u"^備考:?(?P<target>.*)")       # remarks
    p9 = re.compile(u"^期間?:?(?P<target>.*)")      # term
    dic = {
        u"course": u"",
        u"canceled_date": u"2001-01-01",
        u"revenge_date": u"2001-01-01",
        u"date": u"2001-01-01",
        u"teacher": u"",
        u"time": u"",
        u"revenge_place": u"",
        u"remarks": u"",
        u"term": u"2001-01-01",
    }
    for a in contents:
        pstr = htmlentity2unicode(a.string.replace(' ', '').replace(u'\u3000', ''))
        if p1.search(pstr) is not None:
            dic[u"course"] = p1.search(pstr).group('target')
        elif p2.search(pstr) is not None:
            dic[u"canceled_date"] = ScrapeDatetime(p2.search(pstr).group('target'))
        elif p3.search(pstr) is not None:
            dic[u"revenge_date"] = ScrapeDatetime(p3.search(pstr).group('target'))
        elif p4.search(pstr) is not None:
            dic[u"date"] = ScrapeDatetime(p4.search(pstr).group('target'))
        elif p5.search(pstr) is not None:
            dic[u"teacher"] = p5.search(pstr).group('target')
        elif p6.search(pstr) is not None:
            dic[u"time"] = p6.search(pstr).group('target')
        elif p7.search(pstr) is not None:
            dic[u"revenge_place"] = p7.search(pstr).group('target')
        elif p8.search(pstr) is not None:
            dic[u"remarks"] = p8.search(pstr).group('target')
        elif p9.search(pstr) is not None:
            dic[u"term"] = ScrapeDatetime(p9.search(pstr).group('target'))
        # else:
        #     print "No Hit"
    return dic
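# A minimal usage sketch (not part of the original code): fetch a notice page
# with App Engine's urlfetch and run the scraper on it. NOTICE_URL is a
# hypothetical placeholder; the real URL and any session handling belong to
# the caller.
from google.appengine.api import urlfetch

NOTICE_URL = "http://www.example.ac.jp/campusweb/notice"  # hypothetical URL

result = urlfetch.fetch(NOTICE_URL)
if result.status_code == 200:
    notice = ScrapeContents(result.content)
    print notice[u"title"]
    print notice.get(u"course", u""), notice.get(u"canceled_date", u"")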
# Excerpt from the Twitter-search handler: `movie` is a datastore entity and
# `api_url` is the search-API URL built earlier; re, datetime, urlfetch and
# the htmlentity2unicode module are imported at the top of the file.
try:
    result = urlfetch.fetch(api_url)
except:
    # Fetch failed: just record when we last checked.
    movie.check_time = datetime.datetime.today()
    movie.put()
else:
    if result.status_code == 200:
        # The first <id> is the feed itself; the second is the newest tweet,
        # which becomes the since_id for the next search.
        pattern = re.compile(r'<id>tag:search.twitter.com,2005:(.*?)</id>')
        tweet_ids = pattern.findall(result.content)
        if len(tweet_ids) > 1:
            movie.since_id = tweet_ids[1]
        # Tweet texts are in <title> elements; the first one is the feed title.
        pattern2 = re.compile(r'<title>(.*?)</title>')
        tweet_titles = pattern2.findall(result.content)
        movie.tweet_count = movie.tweet_count + len(tweet_titles) - 1
        for i in range(len(tweet_titles)):
            if i > 0:
                tweet = htmlentity2unicode.htmlentity2unicode(tweet_titles[i]).encode('utf-8')
                # Count hits against the positive and negative keyword lists
                # and classify the tweet by whichever side has more matches.
                pattern_good = re.compile(r'よい|よかった|すごい|すごか|おもしろい|おもしろか|かっこよい|かっこよか|わら|たのしい|たのしめた|たのしか|すばらしい|すばらしか|ヨイ|ヨカッタ|スゴイ|スゴカ|オモシロイ|オモシロカ|カッコヨイ|カッコヨカ|ワラ|タノシイ|タノシメタ|タノシカ|スバラシイ|スバラシカ|グッド|ナイス|良い|良かった|凄い|凄か|面白い|面白か|格好良い|格好良か|笑|楽しい|楽しめた|楽しか|素晴らしい|素晴らしかった|good|nice')
                score_good_result = pattern_good.findall(tweet)
                score_good_point = len(score_good_result)
                pattern_bad = re.compile(r'わるい|わるかった|すごくな|おもしろくな|かっこよくな|わらえな|たのしくな|たのしめな|すばらしくな|しょうも|つまら|ひど|ワルイ|ワルカッタ|スゴクナ|オモシロクナ|カッコヨクナ|ワラエナ|タノシクナ|タノシメナ|スバラシクナ|ショウモ|ツマラ|ヒド|バッド|悪い|悪かった|凄くな|面白くな|格好良くな|笑えな|楽しくな|楽しめな|素晴らしくな|詰まら|酷|bad')
                score_bad_result = pattern_bad.findall(tweet)
                score_bad_point = len(score_bad_result)
                if score_good_point > score_bad_point:
                    movie.score_good = movie.score_good + 1
                elif score_good_point < score_bad_point:
                    movie.score_bad = movie.score_bad + 1
                else:
                    movie.score_other = movie.score_other + 1
        movie.check_time = datetime.datetime.today()
        movie.put()
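# -*- coding: utf-8 -*-
# Standalone illustration (an assumption, not part of the original handler) of
# the same keyword-count scoring. The keyword lists are abbreviated, the sample
# tweet is made up, and unicode is matched directly here, whereas the handler
# above matches UTF-8 byte strings.
import re

sample_tweet = u'この映画はすごく面白かった'          # made-up example tweet
good_pat = re.compile(u'面白|良かった|good|nice')    # abbreviated keyword list
bad_pat = re.compile(u'つまら|酷|bad')               # abbreviated keyword list

good_hits = len(good_pat.findall(sample_tweet))
bad_hits = len(bad_pat.findall(sample_tweet))
if good_hits > bad_hits:
    print 'good'
elif good_hits < bad_hits:
    print 'bad'
else:
    print 'other'  # tie or no keyword hits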