Example #1
0
def parseDailyMovie(dl):
    """Download the daily page behind ``dl.link``, split it into movie
    entries, and persist each previously-unseen entry as a MovieLink.

    dl: a DailyLink-like object exposing ``link``, ``parsed`` and ``save()``.
    Raises: URLError (and any read/decode error) propagates to the caller,
    matching the original's no-op ``except URLError: raise`` handler.
    """
    page = urllib2.urlopen(dl.link)
    try:
        content = page.read()
    finally:
        # Close the connection even if read() fails (the original leaked it
        # when read()/decode raised anything other than URLError).
        page.close()
    # The site serves GB18030; normalize to UTF-8 bytes for hashing/storage.
    content = content.decode('gb18030').encode('utf8')

    start = content.find('<div id="content">') + 18  # 18 == len of the marker
    end = content.find('</div>', start)
    content = content[start:end]
    movies = content.split('<br />\r\n<br />\r\n')

    # [^"]* instead of the original greedy .* — greedy matching swallowed
    # everything between the FIRST src="/href=" and the LAST closing tag in an
    # entry, collapsing multiple <IMG>/<A> tags into one bogus match, so only
    # the first image/link of each entry was ever extracted.
    img_pat = re.compile(r'<IMG class="postimg" src="([^"]*)" />', re.IGNORECASE)
    link_pat = re.compile(
        r'<A href="([^"]*)" target=_blank >\*\*\*\*\*點此下載\*\*\*\*\*</A>',
        re.IGNORECASE)

    for movie in movies:
        movie = movie.strip()
        if len(movie) < 20:
            continue  # too short to be a real entry
        # Dedup key: digest of the entry's full raw text.
        digestkey = hashlib.sha224(movie).hexdigest()
        mTitle = movie[:movie.find("<br />")]  # first line is the title

        imagesLink = ";".join(m.group(1) for m in img_pat.finditer(movie))
        dlLinks = ";".join(m.group(1) for m in link_pat.finditer(movie))

        # .exists() issues a cheap EXISTS query instead of fetching rows.
        if MovieLink.objects.filter(digestkey=digestkey).exists():
            logger.info("movie already existed:...." + mTitle)
        else:
            ml = MovieLink(title=mTitle, raw_desc=movie, digestkey=digestkey,
                           daily_link=dl, images=imagesLink, downloadlink=dlLinks)
            ml.save()

    dl.parsed = True
    dl.save()
Example #2
0
def parseDailyMovie(dl):
    """Fetch the daily listing page at ``dl.link``, split it into movie
    entries, and save every entry not already present as a MovieLink.

    dl: a DailyLink-like object exposing ``link``, ``parsed`` and ``save()``.
    Raises: fetch/decode errors propagate (the original's
    ``except URLError: raise`` was a no-op re-raise).
    """
    page = urllib2.urlopen(dl.link)
    try:
        body = page.read()
    finally:
        # Always release the connection; the original leaked it whenever
        # read()/decode raised something other than URLError.
        page.close()
    # Page is GB18030-encoded; re-encode to UTF-8 bytes for hashing/storage.
    body = body.decode('gb18030').encode('utf8')

    start = body.find('<div id="content">') + 18  # 18 == len of the marker
    body = body[start:body.find('</div>', start)]
    entries = body.split('<br />\r\n<br />\r\n')

    # Character class [^"]* replaces the original greedy .* — the greedy form
    # matched from the first quote to the last one in the entry, merging
    # several <IMG>/<A> tags into one match and dropping all but the first
    # image/link of every entry.
    img_pat = re.compile(r'<IMG class="postimg" src="([^"]*)" />', re.IGNORECASE)
    link_pat = re.compile(
        r'<A href="([^"]*)" target=_blank >\*\*\*\*\*點此下載\*\*\*\*\*</A>',
        re.IGNORECASE)

    for entry in entries:
        entry = entry.strip()
        if len(entry) < 20:
            continue  # noise fragment, not a movie entry
        # Dedup on the digest of the entry's raw text.
        digestkey = hashlib.sha224(entry).hexdigest()
        title = entry[:entry.find("<br />")]  # title is the first line

        images = ";".join(m.group(1) for m in img_pat.finditer(entry))
        links = ";".join(m.group(1) for m in link_pat.finditer(entry))

        # EXISTS query instead of materializing rows via len(queryset).
        if MovieLink.objects.filter(digestkey=digestkey).exists():
            logger.info("movie already existed:...." + title)
        else:
            MovieLink(title=title, raw_desc=entry, digestkey=digestkey,
                      daily_link=dl, images=images, downloadlink=links).save()

    dl.parsed = True
    dl.save()
Example #3
0
def parserDaily():
    """Walk every DailyLink not yet parsed, extract its movie entries via
    DailyCollectionParser, and store new ones as MovieLink rows.

    The raw page HTML is cached on ``daily.raw_desc`` so re-runs skip the
    fetch. Each daily is marked ``parsed`` afterwards, even when its HTML
    could not be obtained (best-effort, matching the original behavior).
    """
    unparsed_dailys = DailyLink.objects.filter(parsed=False)
    for daily in unparsed_dailys:
        if daily.raw_desc:
            HTML = daily.raw_desc
        else:
            HTML = getHTML(daily.link)
            daily.raw_desc = HTML  # cache the fetched page on the row

        if HTML:
            dcp = DailyCollectionParser()
            dcp.feed(HTML)
            for movie in dcp.all_movies:
                title = ''
                desc = ''
                if movie.desc:
                    title = movie.desc[0]  # first description line is the title
                    desc = '\r\n'.join(movie.desc).strip()
                images = ';'.join(movie.imgs)
                downloadlink = ';'.join(movie.links)
                # Dedup key is a hash of the download links only.
                # NOTE(review): entries with no links all hash '' and thus
                # share one digest, so only the first such entry is kept —
                # confirm this is intended.
                digestkey = hashlib.sha256(downloadlink).hexdigest()
                # .exists() issues a cheap EXISTS query instead of fetching
                # every matching row just to test membership.
                if not MovieLink.objects.filter(digestkey=digestkey).exists():
                    ml = MovieLink(title=title,
                                   raw_desc=desc,
                                   digestkey=digestkey,
                                   daily_link=daily,
                                   images=images,
                                   downloadlink=downloadlink)
                    ml.save()

        daily.parsed = True
        daily.save()
Example #4
0
def parserDaily():
    """Process all DailyLink rows with ``parsed=False``: parse their HTML
    with DailyCollectionParser and persist unseen movies as MovieLink rows.

    Fetched HTML is stored back onto ``daily.raw_desc`` as a cache; every
    daily is flagged ``parsed`` at the end of its iteration regardless of
    whether HTML was available (preserves the original best-effort flow).
    """
    pending = DailyLink.objects.filter(parsed=False)
    for daily in pending:
        if daily.raw_desc:
            html = daily.raw_desc
        else:
            html = getHTML(daily.link)
            daily.raw_desc = html  # cache so a re-run skips the fetch

        if html:
            parser = DailyCollectionParser()
            parser.feed(html)
            for movie in parser.all_movies:
                title = ''
                desc = ''
                if movie.desc:
                    title = movie.desc[0]  # first line doubles as the title
                    desc = '\r\n'.join(movie.desc).strip()
                images = ';'.join(movie.imgs)
                downloadlink = ';'.join(movie.links)
                # Dedup key hashes only the download links.
                # NOTE(review): a movie with no links hashes the empty string,
                # so all link-less movies collide on one digest — verify.
                digestkey = hashlib.sha256(downloadlink).hexdigest()
                # EXISTS query instead of len(queryset), which would load
                # every matching row only to count them.
                if not MovieLink.objects.filter(digestkey=digestkey).exists():
                    MovieLink(title=title,
                              raw_desc=desc,
                              digestkey=digestkey,
                              daily_link=daily,
                              images=images,
                              downloadlink=downloadlink).save()

        daily.parsed = True
        daily.save()