import hashlib

# DailyLink, MovieLink, DailyCollectionParser and getHTML are assumed to be
# imported elsewhere in this module.

def parserDaily():
    unparsed_dailys = DailyLink.objects.filter(parsed=False)
    # Debugging alternatives:
    #unparsed_dailys = DailyLink.objects.filter(id__gte=1, id__lt=10)
    #unparsed_dailys = DailyLink.objects.all()
    for daily in unparsed_dailys:
        # Reuse the cached page body when available; otherwise fetch and cache it.
        if daily.raw_desc:
            HTML = daily.raw_desc
        else:
            HTML = getHTML(daily.link)
            daily.raw_desc = HTML
        if HTML:
            dcp = DailyCollectionParser()
            dcp.feed(HTML)
            for movie in dcp.all_movies:
                title = ''
                desc = ''
                if movie.desc:
                    title = movie.desc[0]
                    desc = '\r\n'.join(movie.desc).strip()
                images = ';'.join(movie.imgs)
                downloadlink = ';'.join(movie.links)
                # Deduplicate on the joined download links; encode to UTF-8
                # first so hashlib does not choke on non-ASCII unicode.
                digestkey = hashlib.sha256(downloadlink.encode('utf-8')).hexdigest()
                if not MovieLink.objects.filter(digestkey=digestkey).exists():
                    # No movie with the same download links exists yet.
                    ml = MovieLink(title=title, raw_desc=desc,
                                   digestkey=digestkey, daily_link=daily,
                                   images=images, downloadlink=downloadlink)
                    ml.save()
        daily.parsed = True
        daily.save()
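# A minimal sketch of the two models referenced above, inferred purely from
# the fields parserDaily() reads and writes; the real definitions live in the
# app's models.py and may differ (field types, sizes and defaults here are
# assumptions).
from django.db import models

class DailyLink(models.Model):
    link = models.URLField()                        # daily collection page URL
    raw_desc = models.TextField(blank=True)         # cached page HTML
    parsed = models.BooleanField(default=False)     # set once movies extracted

class MovieLink(models.Model):
    title = models.CharField(max_length=255)        # first description line
    raw_desc = models.TextField(blank=True)         # full description text
    digestkey = models.CharField(max_length=64)     # sha256 hex of downloadlink
    daily_link = models.ForeignKey(DailyLink)       # source daily page
    images = models.TextField(blank=True)           # ';'-joined image URLs
    downloadlink = models.TextField(blank=True)     # ';'-joined download links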
def debugdaily():
    # Parse one cached daily page and dump the extracted movies to a text
    # file for manual inspection of the parser output.
    dl = DailyLink.objects.get(id=4)
    dcp = DailyCollectionParser()
    dcp.feed(dl.raw_desc)
    f = open('bt_parser.txt', 'w')
    for movie in dcp.all_movies:
        # Encode each unicode description line before writing to the file.
        dd = [d.encode('utf-8') for d in movie.desc]
        f.write(' ^ '.join(dd))
        f.write('\r\n=======================\r\n')
        f.write(';'.join(movie.imgs))
        f.write('\r\n=======================\r\n')
        f.write(';'.join(movie.links))
        f.write('\r\n\r\n***********************\r\n\r\n')
    f.close()
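# DailyCollectionParser is defined elsewhere; the stand-in below only
# illustrates the interface the two functions above rely on (feed(),
# all_movies, and the per-movie desc/imgs/links attributes). The Movie
# container name and the HTMLParser base class are assumptions.
from HTMLParser import HTMLParser  # Python 2 stdlib

class Movie(object):
    def __init__(self):
        self.desc = []   # unicode description lines; desc[0] becomes the title
        self.imgs = []   # image (poster/screenshot) URLs
        self.links = []  # download links, later ';'-joined and hashed

class DailyCollectionParserSketch(HTMLParser):
    """Minimal stand-in: the real parser fills all_movies while feed() runs."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.all_movies = []
    # handle_starttag/handle_data in the real implementation split the page
    # into Movie blocks and populate desc, imgs and links.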