Beispiel #1
0
class RecBySNS(object):
    """Holds the shared collaborators and the interactive labelling loop.

    The instance owns a ``Database`` plus the ``PyNLPIR``, ``Renren`` and
    ``URL`` helpers, and offers:

    * ``assign_recbysns_entity_sentiment`` - prompt on stdin for a type and
      a sentiment for every entity found in a batch of statuses;
    * ``is_blacklist_word`` - test a word's POS tag against a blacklist.
    """

    def __init__(self):
        self.db = Database()
        # Each helper is handed this instance at construction time.
        self.nlpir = PyNLPIR(self)
        self.renren = Renren(self)
        self.url = URL(self)
        # User-Agent string exposed as an attribute for HTTP requests.
        self.UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"
        # Regexes matched (re.search) against a word's POS tag; a hit means
        # the word is blacklisted (see is_blacklist_word).
        self.pos_blacklist_regexs = [
            "^emoticon$",
            "^title$",
            "^ude.",
            "^w.*",
            "^vshi",
            "^vyou",
            "^p.*",
            "^ule",
            "^m.*",
            "^cc",
            "^session$",
        ]

    def _collect_session_entities(self, session):
        """Return the segments of ``session`` that qualify as entities.

        A segment qualifies when it is tagged ``title`` and the title is
        found among the Douban movies or books in the database, or when it
        is tagged ``url`` and the stored origin URL is a video URL.
        Segments that cannot be parsed or resolved are reported on stdout
        and skipped.
        """
        entities = []
        for segment in session:
            pos = self.nlpir.get_POS(segment)
            if pos == "title":
                match = re.match(u"《(.*?)》/title", segment)
                if match is None:
                    # Malformed title segment: report and skip instead of
                    # crashing, mirroring the URL branch below.
                    print("###########%s###########" % segment)
                    continue
                title = match.group(1)
                if (self.db.select_douban_movie_by_title(title) or
                        self.db.select_douban_book_by_title(title)):
                    entities.append(segment)
            elif pos == "url":
                match = re.search(u"(http.*)/url", segment)
                if match is None:
                    print("###########%s###########" % segment)
                    continue
                url = self.db.select_recbysns_url_by_short_url(match.group(1))
                if url is None:
                    # Short URL unknown to the database.
                    print("***********%s***********" % segment)
                    continue
                if self.url.is_video_url(url["origin_url"]):
                    entities.append(segment)
        return entities

    def assign_recbysns_entity_sentiment(self):
        """Interactively label the entities of a batch of Weibo statuses.

        Every selected status is segmented into sessions. For each entity
        of a session the status text, the session text and the entity are
        printed, an integer type and an integer sentiment are read from
        stdin, and one row is inserted into ``recbysns_entity`` and
        committed immediately.

        Raises:
            ValueError: if a line read from stdin is not an integer.
        """
        # NOTE(review): the two trailing integers are passed straight to
        # select_table; they look like an offset and a limit - confirm.
        for status in self.db.select_table(
            "weibo_status",
            "text like '%%《%%》%%' or \
                                            text like '%%http://%%' or \
                                            text like '%%https://%%'",
            12696,
            5,
        ):
            sessions = self.nlpir.segment_weibo_status(status["text"])
            for session_index, session in enumerate(sessions):
                # Session text is every segment with its "/POS" suffix cut.
                session_text = "".join(
                    segment.rsplit("/", 1)[0] for segment in session)
                entities = self._collect_session_entities(session)
                # Occurrence counter so that repeated entities within one
                # session get positions 0, 1, 2, ...
                positions = {}
                for entity in entities:
                    position = positions.get(entity, -1) + 1
                    positions[entity] = position
                    print(status["text"])
                    print(session_text)
                    print(entity)
                    print("Type:")
                    entity_type = int(sys.stdin.readline())
                    print("Sentiment:")
                    sentiment = int(sys.stdin.readline())
                    self.db.query(
                        "INSERT INTO recbysns_entity( \
                                   entity, status_id, session, position, \
                                   type, score) \
                                   VALUES(%s, %s, %s, %s, %s, %s)",
                        (entity, status["id"], session_index, position,
                         entity_type, sentiment),
                    )
                    # Commit per entity so finished labels survive an abort.
                    self.db.commit()

    def is_blacklist_word(self, word):
        """Return True if the POS tag of ``word`` matches a blacklist regex."""
        pos = self.nlpir.get_POS(word)
        return any(re.search(pattern, pos)
                   for pattern in self.pos_blacklist_regexs)
Beispiel #2
0
class Douban(object):
    def __init__(self):
        self.client = \
            DoubanClient('028bc5c2b034fb1c07a35148109ef154',
                         '2f42bec4d6a403b4',
                         'http://rec.jjyao.me',
                         'douban_basic_common,shuo_basic_r,shuo_basic_w')
        #self.client.auth_with_code('39076cd663a27f06')
        from recbysns import recbysns
        self.recbysns = recbysns
        self.db = Database()
        self.cookie = 'bid="feXKjUDU9TI"; ue="*****@*****.**"; ll="118159"; ct=y; viewed="10001392_4025068_3098478_3302642_1434275_21760836_10537640_10522595_6799191"; dbcl2="51087586:lfMA83G6vyc"; ck="hW6b"; __utma=30149280.1374624205.1362216082.1368805078.1368838128.94; __utmb=30149280.31.10.1368838128; __utmc=30149280; __utmz=30149280.1368798357.92.42.utmcsr=movie.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/subject/11529526/; __utmv=30149280.5108; __utma=81379588.1127088638.1342424134.1368807092.1368838128.118; __utmb=81379588.52.8.1368843871481; __utmc=81379588; __utmz=81379588.1368798357.115.34.utmcsr=movie.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/subject/11529526/'

    def __getattr__(self, attr):
        return _Callable(self, attr, getattr(self.client, attr))

    def book_search(self, **kw):
        """ Fallback for book.search """
        keyword = kw['q']
        books = []
        for i in [0, 15]:
            url = 'http://book.douban.com/subject_search?start=%d&search_text=%s&cat=1001' % \
                  (i, keyword.encode('utf-8'))
            html = pq(url=url, parser='html', opener=lambda url, **kw:
                      urllib2.urlopen(urllib2.Request(url, headers={
                            'User-Agent': self.recbysns.UA,
                            'Cookie': self.cookie}), timeout=10).read().
                            decode('utf8', 'ignore'))
            for book in html('#content .subject-list .subject-item'):
                book = pq(book)
                collect_info = book('.info .collect-info span')
                # I have not read the book
                if len(collect_info) > 1:
                    state = pq(collect_info[1]).text()
                    if state == u'经常读':
                        continue
                book_id = int(re.match('http://book.douban.com/subject/(\d+)/',
                              book('.pic .nbg').attr('href')).group(1))
                title = book('.info h2 a').attr('title')
                image = book('.pic .nbg img').attr('src')
                pub = book('.info .pub').text()
                rating = pq(book('.info .star'))
                if rating('.rating_nums'):
                    numRaters = int(re.match(u'\((\d+)',
                                    rating('.pl').text()).group(1))
                    average = rating('.rating_nums').text()
                    rating = {"numRaters": numRaters, "average": average}
                else:
                    rating = {"numRaters": 0, "average": 0}
                books.append({'id': book_id, 'title': title,
                              'image': image, 'pub': pub, 'rating': rating})
        return {'books': books, 'total': len(books)}

    def movie_search(self, **kw):
        """ Fallback for movie.search """
        keyword = kw['q']
        url = 'http://movie.douban.com/subject_search?search_text=%s&cat=1002' % \
              keyword.encode('utf-8')
        html = pq(url=url, parser='html', opener=lambda url, **kw:
                    urllib2.urlopen(urllib2.Request(url, headers={
                        'User-Agent': self.recbysns.UA,
                        'Cookie': self.cookie}), timeout=10).read())
        movies = []
        for movie in html('#content table .item'):
            movie = pq(movie)
            id = int(re.match('http://movie.douban.com/subject/(\d+)/',
                     movie('.nbg').attr('href')).group(1))
            image = movie('.nbg img').attr('src')
            pub = movie('.pl2>.pl').text()
            rating = pq(movie('.pl2 .star'))
            if rating and rating('.rating_nums').text():
                numRaters = int(re.match(u'\((\d+)',
                                rating('.pl').text()).group(1))
                average = rating('.rating_nums').text()
                rating = {"numRaters": numRaters, "average": average}
            else:
                rating = {"numRaters": 0, "average": 0}
            titles = [title.strip()
                      for title in movie('.pl2>a').text().split('/')]
            movies.append({'id': id, 'titles': titles,
                           'image': image, 'pub': pub, 'rating': rating})
        return {'movies': movies, 'total': len(movies)}

    def crawl_book(self):
        #start = 63831 %《%》%
        for book in self.db.select_table('douban_book', 'img_url is null'):
            try:
                result = douban.book.search(q=book['title'])
                if int(result['total']) == 0:
                    print book['title']
                    continue
                for result in result['books']:
                    if result['title'] == book['title']:
                        self.db.query("UPDATE douban_book SET id = %s, \
                                      img_url = %s, pub = %s, \
                                      raters_num = %s, score = %s \
                                      WHERE id = %s",
                                      (result['id'], result['image'],
                                       result['pub'],
                                       result['rating']['numRaters'],
                                       result['rating']['average'],
                                       book['id']))
                        self.db.commit()
                        break
            except Exception as e:
                print book['title']
                print e

    def crawl_movie(self):
        for movie in self.db.select_table('douban_movie', 'img_url is null'):
            try:
                result = douban.movie.search(q=movie['title'])
                if int(result['total']) == 0:
                    print movie['title']
                    continue
                for result in result['movies']:
                    if movie['title'] in result['titles']:
                        self.db.query("UPDATE douban_movie SET id = %s, \
                                       img_url = %s, pub = %s, \
                                       raters_num = %s, score = %s \
                                       WHERE id = %s",
                                       (result['id'], result['image'],
                                        result['pub'],
                                        result['rating']['numRaters'],
                                        result['rating']['average'],
                                        movie['id']))
                        self.db.commit()
                        break
            except Exception as e:
                print movie['title']
                print e
Beispiel #3
0
# coding=utf8
import codecs
from db import Database

# NER_* entity category codes.
NER_BOOK = 0
NER_MOVIE = 2
NER_VIDEO = 3
NER_OTHERS = -1
NER_MAGAZINE = 1
NER_MUSIC = 4
NER_TV_PROGRAM = 5

# SA_* sentiment polarity codes.
SA_POSITIVE = 1
SA_NETURAL = 0
# Correctly spelled alias; the misspelled name is kept for existing callers.
SA_NEUTRAL = SA_NETURAL
SA_NEGATIVE = -1


def _load_emoticons(condition):
    """Return the 'emoticon' values of weibo_emoticon rows matching condition."""
    return set(row['emoticon']
               for row in db.select_table('weibo_emoticon', condition))


def _load_word_set(path):
    """Return the set of whitespace-stripped lines of the UTF-8 file at path."""
    # The with-block closes the file even if reading or decoding fails.
    with codecs.open(path, 'r', 'utf8') as word_file:
        return set(word.strip() for word in word_file.readlines())


# Everything below runs at import time: the emoticon sets come from the
# database, the word sets from text files under data/.
db = Database()
SA_POSITIVE_EMOTICONS = _load_emoticons('score = 1')
SA_NEGATIVE_EMOTICONS = _load_emoticons('score = -1')
SA_POSITIVE_WORDS = _load_word_set('data/recbysns_positive_words.txt')
SA_NEGATIVE_WORDS = _load_word_set('data/recbysns_negative_words.txt')
SA_NEGATIVES = _load_word_set('data/recbysns_negatives.txt')