def get_movie_info(movie_id):
    """Fetch one movie page, store the movie, and crawl its top comments.

    A ``Process`` record keyed by ``movie_id`` tracks completion; when it is
    already marked successful the function returns immediately.  Otherwise
    the ``Movie`` record is built from the scraped page if none is stored
    yet, or reused if an earlier run stored it without finishing, and the
    comment crawl runs before the process is marked as succeeded.

    Args:
        movie_id: Identifier substituted into ``MOVIE_URL`` and used as the
            id of both the ``Process`` and the ``Movie`` record.

    Returns:
        None.

    Raises:
        IndexError: If one of the XPath queries for the title, rating, or
            poster matches nothing on the fetched page.
    """
    create_app()
    process = Process.get_or_create(id=movie_id)
    if process.is_success:
        return

    print('Starting fetch movie: {}'.format(movie_id))
    start = time.time()

    movie = Movie.objects.filter(id=movie_id)
    if not movie:
        html = get_tree(MOVIE_URL.format(movie_id))
        name = html.xpath("//div[@id='content']//h1/span/text()")[0]
        mark = html.xpath(
            "//div[@class='rating_wrap clearbox']//strong/text()")[0]
        picture = html.xpath(
            "//div[@id='content']//div[@id='mainpic']//img/@src")[0]
        movie = Movie(id=movie_id, name=name, mark=mark, picture=picture)
        movie.save()
    else:
        # The movie was stored by an earlier run that never reached
        # make_succeed(); reuse it so the crawl can finish instead of being
        # skipped on every retry.
        movie = movie[0]

    get_top_comment_and_user_info(movie_id, movie)
    process.make_succeed()
    print('Finished fetch movie: {} Cost: {}'.format(
        movie_id, time.time() - start))
Exemple #2
0
def parser_artist(artist_id):
    """Crawl one artist page and every song linked from its top-50 list.

    Progress is tracked by a ``Process`` record keyed by ``artist_id``: if
    it is already marked successful nothing is done.  Otherwise the artist
    is created from the page (or the stored one is reused), each linked song
    is handed to ``parser_song``, the non-``None`` results are saved on the
    artist, and the process is marked as succeeded.

    Args:
        artist_id: Identifier substituted into ``ARTIST_URL`` and used as
            the id of both the ``Process`` and the ``Artist`` record.

    Returns:
        None.

    Raises:
        IndexError: If the artist is new and the name or picture XPath
            matches nothing, or if a song link contains no ``=``.
    """
    create_app()
    process = Process.get_or_create(id=artist_id)  # one Process per artist
    if process.is_success:
        return

    print('Starting fetch artist: {}'.format(artist_id))
    start = time.time()

    # Original note: get_tree fetches the page text with requests and turns
    # it into an lxml object.
    tree = get_tree(ARTIST_URL.format(artist_id))

    artist = Artist.objects.filter(id=artist_id)
    if not artist:  # never crawled before
        artist_name = tree.xpath('//h2[@id="artist-name"]/text()')[0]
        picture = tree.xpath(
            '//div[contains(@class, "n-artist")]//img/@src')[0]
        artist = Artist(id=artist_id, name=artist_name, picture=picture)
        artist.save()
    else:  # crawled before, but this artist's songs were not finished
        artist = artist[0]

    song_items = tree.xpath('//div[@id="artist-top50"]//ul/li/a/@href')
    songs = []
    for item in song_items:
        # The song id is the text after the first '=' in the link.
        song_id = item.split('=')[1]
        song = parser_song(song_id, artist)  # fetch and parse this song
        if song is not None:
            songs.append(song)
    artist.songs = songs
    artist.save()
    # Mark the hot comments of this artist's hot songs as fully crawled.
    process.make_succeed()
    print('Finished fetch artist: {} Cost: {}'.format(artist_id,
                                                      time.time() - start))
Exemple #3
0
def parser_artist(artist_id):
    """Crawl an artist page together with the songs in its top-50 list.

    A ``Process`` record with id ``artist_id`` marks whether the artist was
    already handled; if so the function returns at once.  Otherwise the
    artist is created from the scraped page or, when already stored,
    reused; every song link is passed to ``parser_song`` and the results
    that are not ``None`` are saved on the artist before the process is
    marked as succeeded.

    Args:
        artist_id: Identifier substituted into ``ARTIST_URL`` and used as
            the id of both the ``Process`` and the ``Artist`` record.

    Returns:
        None.

    Raises:
        IndexError: If the artist is new and the name or picture XPath
            matches nothing, or if a song link contains no ``=``.
    """
    create_app()
    process = Process.get_or_create(id=artist_id)
    if process.is_success:
        return

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Starting fetch artist: {}'.format(artist_id))
    start = time.time()

    tree = get_tree(ARTIST_URL.format(artist_id))

    artist = Artist.objects.filter(id=artist_id)
    if not artist:
        # First time this artist is seen: build it from the page.
        artist_name = tree.xpath('//h2[@id="artist-name"]/text()')[0]
        picture = tree.xpath(
            '//div[contains(@class, "n-artist")]//img/@src')[0]
        artist = Artist(id=artist_id, name=artist_name, picture=picture)
        artist.save()
    else:
        # Already stored by an earlier, unfinished run.
        artist = artist[0]

    song_items = tree.xpath('//div[@id="artist-top50"]//ul/li/a/@href')
    songs = []
    for item in song_items:
        # The song id is the text after the first '=' in the link.
        song_id = item.split('=')[1]
        song = parser_song(song_id, artist)
        if song is not None:
            songs.append(song)
    artist.songs = songs
    artist.save()
    process.make_succeed()
    print('Finished fetch artist: {} Cost: {}'.format(
        artist_id, time.time() - start))
Exemple #4
0
def parser_artist(artist_id):
    """Fetch an artist, then fetch each song listed in its top-50 block.

    Completion is recorded on a ``Process`` record whose id is
    ``artist_id``; a process already marked successful makes this a no-op.
    A missing ``Artist`` is created from the page, an existing one is
    reused.  Each song link goes through ``parser_song``; results that are
    ``None`` are dropped, the rest are stored on the artist, and finally the
    process is marked as succeeded.

    Args:
        artist_id: Identifier substituted into ``ARTIST_URL`` and used as
            the id of both the ``Process`` and the ``Artist`` record.

    Returns:
        None.

    Raises:
        IndexError: If the artist is new and the name or picture XPath
            matches nothing, or if a song link contains no ``=``.
    """
    create_app()
    process = Process.get_or_create(id=artist_id)
    if process.is_success:
        return

    # print(...) with one pre-formatted argument works on Python 2 and 3.
    print('Starting fetch artist: {}'.format(artist_id))
    start = time.time()

    tree = get_tree(ARTIST_URL.format(artist_id))

    artist = Artist.objects.filter(id=artist_id)
    if not artist:
        # Not stored yet: scrape the name and picture and save the artist.
        artist_name = tree.xpath('//h2[@id="artist-name"]/text()')[0]
        picture = tree.xpath(
            '//div[contains(@class, "n-artist")]//img/@src')[0]
        artist = Artist(id=artist_id, name=artist_name, picture=picture)
        artist.save()
    else:
        # Stored previously; continue with the existing record.
        artist = artist[0]

    song_items = tree.xpath('//div[@id="artist-top50"]//ul/li/a/@href')
    songs = []
    for item in song_items:
        # Take the part of the link that follows the first '='.
        song_id = item.split('=')[1]
        song = parser_song(song_id, artist)
        if song is not None:
            songs.append(song)
    artist.songs = songs
    artist.save()
    process.make_succeed()
    print('Finished fetch artist: {} Cost: {}'.format(artist_id,
                                                      time.time() - start))
Exemple #5
0
def parser_artist(artist_id):
    """Crawl one artist and all of its top-50 songs, all-or-nothing.

    A ``Process`` record keyed by ``artist_id`` tracks completion.  The
    function returns early, without marking success, when the process is
    already successful, when ``get_tree`` returns ``None``, when the page
    lists no songs, or as soon as ``parser_song`` returns ``None`` for any
    song.  Only when every song parses are the songs saved on the artist
    and the process marked as succeeded.

    Args:
        artist_id: Identifier substituted into ``ARTIST_URL`` and used as
            the id of both the ``Process`` and the ``Artist`` record.

    Returns:
        None.

    Raises:
        IndexError: If the artist is new and the name or picture XPath
            matches nothing, or if a song link contains no ``=``.
    """
    create_app()
    process = Process.get_or_create(id=artist_id)
    if process.is_success:
        print("find process artist finished ,return")
        return

    print('Starting fetch artist: {}'.format(artist_id))
    start = time.time()

    tree = get_tree(ARTIST_URL.format(artist_id))  # fetch the artist page
    # Identity test on purpose: `== None` can be overridden by the object,
    # and truth-testing a parsed tree may not mean "missing".
    if tree is None:
        print("fetch artist url get none,return !")
        return

    artist = Artist.objects.filter(id=artist_id)
    if not artist:
        print("create artist " + str(artist_id))
        artist_name = tree.xpath('//h2[@id="artist-name"]/text()')[0]
        picture = tree.xpath(
            '//div[contains(@class, "n-artist")]//img/@src')[0]
        artist = Artist(id=artist_id, name=artist_name, picture=picture)
        artist.save()
    else:
        artist = artist[0]
        print("artist exist " + str(artist_id))

    print("fetching all song comments")
    # Original note: '//ul[@class="f-hide"]/li/a/@href' gives the same links.
    song_items = tree.xpath('//div[@id="artist-top50"]//ul/li/a/@href')
    print(song_items)
    if not song_items:
        print("Artist  get no songs ,return fetch artist {}".format(artist_id))
        return

    songs = []
    for item in song_items:
        # The song id is the text after the first '=' in the link.
        song_id = item.split('=')[1]
        song = parser_song(song_id, artist)
        if song is None:
            # Abort the whole artist so a later run retries it from scratch.
            print("parse song failed,return ")
            return
        songs.append(song)
    artist.songs = songs
    artist.save()
    process.make_succeed()
    print('Finished fetch artist: {} Cost: {}'.format(
        artist_id, time.time() - start))