コード例 #1
0
ファイル: get_movies.py プロジェクト: ylbupt/movie
def get_douban_movies(parser,mlist):
    #print ename
    #ename ,year = get_title_year(ename)
    #print ename 
    #ename = "Le domaine des dieux"
    lastres=[]
    if len(mlist) ==0:
        return []

    handler = site_handler.get_site_handler("www.douban.com",parser) 
    for m in mlist:
    
        res = []
        try:
            url="http://movie.douban.com/subject_search?search_text="
            if len(m.ename)>0:
                lurl="http://movie.douban.com/subject_search?search_text="+m.ename
                print "get douban",lurl
                page = utils.crawl_timeout(lurl,15,3)
                if page ==None:
                    print "ERROR,timeout,douban,url=",lurl
                    continue
                reslist = handler.dir_parse(url,page)
                for ti in reslist:
                    if is_maybe_ok(ti,m):
                        ti.search_key = m.ename
                        print "xxxx"
                        res.append(ti)
 
            time.sleep(1)
            lurl="http://movie.douban.com/subject_search?search_text="+m.cname
            print "get douban",lurl
            page = utils.crawl_timeout(lurl,15,3)
            if page ==None:
                print "ERROR,timeout,douban,url=",lurl
                continue
            reslist = handler.dir_parse(url,page)
            for ti in reslist:
                if is_maybe_ok(ti,m):

                    print "zzz"
                    ti.search_key = m.cname
                    res.append(ti)

            time.sleep(1)
            ti = get_most_like(m,res)
            if ti !=None:
                print "get it",ti.url,ti.raw,ti.pic_url
                ti.mid = ti.url.split('subject/')[1].split('/')[0]
                m.mid = ti.mid
                print "ppppp",m.mid
                lastres.append(ti)
        except Exception,e:

            traceback.print_exc(sys.stdout)  
            print "ERROR:",e
            print "ERROR:get douban search error",m.raw
            print "ERROR:get douban ",m.ename,"//",m.cname
            continue
コード例 #2
0
ファイル: get_movies.py プロジェクト: ylbupt/movie
def get_imdb_movies(parser,mlist):
    res = []
    if len(mlist) ==0:
        return []

    handler = site_handler.get_site_handler("www.imdb.com",parser) 

    for m in mlist:
    
        if has_chinese(m.ename):
            continue
        try:
            url="http://www.imdb.com/find?q="
            if len(m.ename)>0:

                key = m.ename.replace('.',' ')
                lurl="http://www.imdb.com/find?q="+key
                print lurl
                page = utils.crawl_timeout(lurl,15,3)
                if page ==None:
                    print "ERROR,timeout,imdb,url=",lurl
                    continue

                dlist = handler.dir_parse(url,page) 
                for ti in dlist:
                    print "get imdb dir_parse",ti.raw,ti.url,ti.year
                    imdbid = ti.url.split("title/")[1].split("/")[0]
                    if "tt" in imdbid:
                        ti.imdbid = imdbid.replace('tt','')
                    print "m ename",m.ename
                    print "m year",m.year
                    if abs(int(ti.year)  - int(m.year)) <=1:
                        m.imdbid = ti.imdbid
                        res.append(ti)
                        print "print add imdb result",ti.url
                        break
            time.sleep(1)

        except Exception,e:

            traceback.print_exc()  
            print "ERROR:",e
            print "ERROR:get imdb search error",m.raw
            print "ERROR:get imdb ",m.ename,"//",m.cname
            continue
コード例 #3
0
ファイル: get_movies.py プロジェクト: ylbupt/movie
def get_douban_movies2(parser,mlist):
    #print ename
    #ename ,year = get_title_year(ename)
    #print ename 
    #ename = "Le domaine des dieux"

    if len(mlist) ==0:
        return []

    handler = site_handler.get_site_handler("www.douban.com",parser) 
    res = []
    for k,v in mlist.items():
    
        try:
            url="http://movie.douban.com/subject_search?search_text="
            if len(k)>0:
                key = k.replace('.',' ')
                lurl="http://movie.douban.com/subject_search?search_text="+key
                print "get douban",lurl
                page = utils.crawl_timeout(lurl,15,3)
                if page ==None:
                    print "ERROR,timeout,douban,url=",lurl
                    continue
                reslist = handler.dir_parse(url,page)
                maybelist = []
                for ti in reslist:
                    for m in v:
                        if is_maybe_ok(ti,m):
                            ti.search_key = k
                            maybelist.append(ti)
                ti = get_most_like(v,maybelist)             
                if ti:
                    res.append(ti)
 
            time.sleep(1)

        except Exception,e:

            traceback.print_exc(sys.stdout)  
            print "ERROR:",e
            print "ERROR:get douban search error",k
            continue
コード例 #4
0
ファイル: get_movies.py プロジェクト: ylbupt/movie
def get_imdb_movies2(parser,mmap):
    res = []
    if len(mmap) ==0:
        return []

    search_list = []
    handler = site_handler.get_site_handler("www.imdb.com",parser) 

    for k,v  in mmap.items():
        if has_chinese(k):
            continue
    
        try:
            url="http://www.imdb.com/find?q="
            if len(k)>0:
                key = k.replace('.',' ')
                lurl="http://www.imdb.com/find?q="+key
                print lurl
                page = utils.crawl_timeout(lurl,15,3)
                if page ==None:
                    print "ERROR,timeout,imdb,url=",lurl
                    continue

                dlist = handler.dir_parse(url,page) 
                for ti in dlist:
                    for m in v:
                        if abs(int(ti.year)  - int(m.year)) <=1:
                            ti.search_key = k
                            res.append(ti)
            time.sleep(1)

        except Exception,e:

            traceback.print_exc()  
            print "ERROR:",e
            print "ERROR:get imdb search error",k
            continue
コード例 #5
0
ファイル: update2db.py プロジェクト: ylbupt/movie
            continue
        elif "imdb" in url:
            if url not in imdburlmap:
                #print "get imdb", url
                imdburlmap[url] = 1
            continue

        try:
            if "http://" in url:
                print "craw link", url
                page = utils.crawl_timeout(url, 15, 3)
                #       print "xxx",lurl
                if page == None:
                    print "ERROR:", url
                else:
                    handler = get_site_handler(url, parser)
                    it, urls = handler.parse(url, page)
                    if it != None:
                        if it.cname == None or it.cname == "":
                            it.cname = urllist[2]
                            it.ename = urllist[3]
                            it.date = urllist[4]
                        it.mid = urllist[5]
                        it.imdbid = urllist[6]
                        if it.quality == "":
                            it.quality = urllist[7]
                        it.raw = urllist[1]
                        detaillist.append(it)
                        for ur in urls:
                            u = ur.strip('/')
                            if "douban" in u:
コード例 #6
0
ファイル: update2db.py プロジェクト: ylbupt/movie
            continue
        elif "imdb" in url:
            if url not in  imdburlmap:
                #print "get imdb", url
                imdburlmap[url]=1
            continue

        try:
            if "http://" in url:
                print "craw link",url
                page=utils.crawl_timeout(url,15,3)
             #       print "xxx",lurl
                if page ==None:
                    print "ERROR:",url
                else: 
                    handler = get_site_handler(url,parser)
                    it,urls = handler.parse(url,page)
                    if it!=None:
                        if it.cname ==None or it.cname =="":
                            it.cname = urllist[2]
                            it.ename = urllist[3]
                            it.date = urllist[4]
                        it.mid = urllist[5]
                        it.imdbid = urllist[6]
                        if it.quality =="":
                            it.quality = urllist[7]
                        it.raw = urllist[1]
                        detaillist.append(it)
                        for ur in urls:
                            u = ur.strip('/')
                            if "douban" in u:
コード例 #7
0
ファイル: get_movies.py プロジェクト: ylbupt/movie
if __name__ == "__main__":
    try:
        mmap = {}
        urlfile = sys.argv[2]
        output_url = sys.argv[3]
 #       output_link = sys.argv[3]
 #       pic_dir = sys.argv[4]
        parser = parse.Parser()
        parser.init(sys.argv[1])
        mlist = [] 
        for line in open(urlfile,'r'):
            flist = line.strip().split('\t')
            url = flist[0]
            quality = flist[1]
            print url
            handler = site_handler.get_site_handler(url,parser) 
            page = utils.crawl_timeout(url,15,3)
            if  page !=None:
                cclist = handler.dir_parse(url,page)
                for m in cclist:
                    m.quality = quality
                mlist.extend(cclist)
    
        for m in mlist:
            print m.url
            print m.raw
            print m.cname
            print m.ename
        havelist = Link.objects.filter(urlmd5__in=[utils.get_md5_value(it.url) for it in mlist])
        linkmap = { i.url:i for i in havelist}
        linklist = []