Ejemplo n.º 1
0
 def traverse(self, text):
     """Rewrite every ``{{ ... }}`` span in *text* and apply the closing fix-ups.

     On each pass the offsets of all ``{{`` and ``}}`` markers are looked up
     with ``util.findall``. While ``}}`` markers outnumber ``{{`` markers the
     last two characters of the text are dropped. Otherwise the span running
     from the last ``{{`` to just past the first ``}}`` is handed to
     ``self.careful_sub(span, True)`` and every occurrence of that span in the
     text is replaced by the result. The loop stops once no ``{{`` is left.

     Afterwards ``~`` becomes a newline, ``self.final_sub`` is applied, and
     the placeholders `` ` `` and ``@@`` are turned into ``{`` and ``}``.

     Returns the rewritten text.
     """
     while True:
         openers = util.findall(text, "{{")
         closers = util.findall(text, "}}")

         if len(openers) < len(closers):
             # Surplus "}}": trim two characters off the end and rescan.
             text = text[:-2]
             # No lasting effect: closers is recomputed at the top of the next pass.
             closers.pop()
             continue

         if not openers:
             break

         # Span from the last "{{" up to and including the first "}}".
         span = text[openers[-1]:closers[0] + 2]
         replacement = self.careful_sub(span, True)
         text = text.replace(span, replacement)

     text = self.final_sub(text.replace("~", "\n"))
     return text.replace("`", "{").replace("@@", "}")
Ejemplo n.º 2
0
def movie_expression(context, censored_info):
    """Process every movie link found in a listing page.

    Args:
        context: HTML text that is searched for
            ``<a class="movie-box" href="...">`` anchors.
        censored_info: Value stored on ``result_msg.uncensored`` for this run.

    Returns:
        The shared ``result_msg`` object (defined outside this function),
        after ``uncensored`` has been assigned, ``reset()`` has been called
        and ``total_count`` has been set to the number of links found.
    """
    result_msg.uncensored = censored_info
    result_msg.reset()

    movies_url_tr = r'<a class="movie-box" href="(.*?)">'
    movie_results = findall(movies_url_tr, context)
    result_msg.total_count = len(movie_results)

    # Links are handled one at a time, in the order they were found.
    # NOTE: an earlier threaded variant (one thread per link) existed only as
    # commented-out code and has been removed together with its unused
    # ``tasks`` list.
    for line in movie_results:
        process_task(line, context)
    return result_msg
Ejemplo n.º 3
0
 def traverse(self, text):
     """Rewrite every ``{{ ... }}`` span in *text* and apply the closing fix-ups.

     On each pass the offsets of all ``{{`` and ``}}`` markers are looked up
     with ``util.findall``. While ``}}`` markers outnumber ``{{`` markers the
     last two characters of the text are dropped. Otherwise the span running
     from the last ``{{`` to just past the first ``}}`` is handed to
     ``self.careful_sub(span, True)`` and every occurrence of that span in the
     text is replaced by the result. The loop stops once no ``{{`` is left.

     Afterwards ``~`` becomes a newline, ``self.final_sub`` is applied, and
     the placeholders `` ` `` and ``@@`` are turned into ``{`` and ``}``.

     Returns the rewritten text.
     """
     while True:
         openers = util.findall(text, "{{")
         closers = util.findall(text, "}}")

         if len(openers) < len(closers):
             # Surplus "}}": trim two characters off the end and rescan.
             text = text[:-2]
             # No lasting effect: closers is recomputed at the top of the next pass.
             closers.pop()
             continue

         if not openers:
             break

         # Span from the last "{{" up to and including the first "}}".
         span = text[openers[-1]:closers[0] + 2]
         replacement = self.careful_sub(span, True)
         text = text.replace(span, replacement)

     text = self.final_sub(text.replace("~", "\n"))
     return text.replace("`", "{").replace("@@", "}")
Ejemplo n.º 4
0
def get_magnets(movie_info):
    """Fetch the magnet-link table for a movie and store one Magnet per row.

    Args:
        movie_info: Movie object; its ``gid``, ``img``, ``uc``, ``title`` and
            ``no`` attributes are read, and ``mag_count`` is incremented and
            saved once for every magnet stored.

    Returns:
        A summary string of the form ``"<no> find: <count> resource"``, which
        is also printed.

    Side effects visible here: one ``models.Magnet`` is saved per table row,
    ``movie_info`` is saved after each row, and the shared
    ``result_msg.resource_count`` is incremented per row.
    """
    # The URL carries the authentication-related parameters that the site's
    # web interface adds; ``floor`` is a random integer in 1..1000.
    floor = int(math.floor(random.random() * 1e3 + 1))
    magnet_list_url = (
        r'https://www.seedmm.com/ajax/uncledatoolsbyajax.php?gid=' + movie_info.gid
        + '&lang=zh&img=' + movie_info.img
        + '&uc=' + movie_info.uc
        + '&floor=' + str(floor)
    )
    magnet_context = web_content(magnet_list_url)

    # Captures per row: magnet URL, name cell, size cell, share-date cell.
    magnet_str = r'<tr.*>\s*<td.*>\s*<a.*(magnet.*?)">\s*(.*?)\s*</a>\s*</td>\s*<td.*>\s*<a.*">\s*(.*?)\s*</a>\s*</td>\s*<td.*>\s*<a.*">\s*(.*?)\s*</a>\s*</td>\s*</tr>'
    magnet_result = findall(magnet_str, magnet_context)
    magnet_count = 0

    for line in magnet_result:
        magnets = models.Magnet(title=movie_info.title,
                                movie=movie_info,
                                magnet_url=line[0])

        # Text of the name cell that precedes an embedded "<a " tag. It is
        # looked up once and reused below; this assumes search_str and
        # sub_filename return the same value for the same input - TODO confirm.
        raw_name = search_str(r'(.*?)<a\s', line[1])
        # NOTE(review): a prefix longer than one character is treated as the
        # HD marker; presumably the embedded link is an HD badge - confirm.
        is_hd = len(raw_name) > 1
        file_name = sub_filename(raw_name)

        magnets.file_name = file_name
        if is_hd:
            magnets.hd = 'y'

        # Work around names in an undetectable encoding: fall back to the
        # movie title when chardet reports zero confidence.
        if chardet.detect(file_name)['confidence'] == 0:
            magnets.file_name = movie_info.title

        magnets.size = line[2]
        magnets.share_date = line[3]
        magnets.save()

        movie_info.mag_count += 1
        movie_info.save()
        magnet_count += 1
        result_msg.resource_count += 1

    message = "" + movie_info.no + " find: " + str(magnet_count) + " resource"
    # Call form prints the same text on Python 2 and also parses on Python 3.
    print(message)
    return message
Ejemplo n.º 5
0
def detail_expression(detail_url):
    """Return the Movie for *detail_url*, scraping and storing it if needed.

    If a ``models.Movie`` with this ``movie_url`` already exists, the first
    match is returned and nothing is fetched. Otherwise the detail page is
    downloaded with ``web_content``, its fields are extracted with the regular
    expressions below, the movie is saved, ``get_magnets`` is run for it, and
    its actor and tag links are stored.

    Args:
        detail_url: URL of the movie detail page.

    Returns:
        The existing or newly created ``models.Movie``.

    Side effects visible here: prints progress messages, increments the shared
    ``result_msg.movie_count`` for a newly stored movie, and reads
    ``result_msg.uncensored`` to choose the actor URL pattern.
    """
    new_movies = models.Movie.objects.filter(movie_url=detail_url)
    print(detail_url)

    if new_movies:
        return new_movies.first()
    new_movie = models.Movie(movie_url=detail_url)

    detail_context = web_content(detail_url)

    title_str = r'<h3>(.*?)</h3>'
    no_str = r'<p><span class="header">識別碼:</span> <span style="color:#CC0000;">(.*?)</span></p>'
    date_str = r'<p><span class="header">發行日期:</span> (.*?)</p>'
    length_str = r'<p><span class="header">長度:</span> (.*?)</p>'
    director_str = r'<p><span class="header">導演:</span> <a href="https://www.seedmm.com/director/\w{1,64}">(.*?)</a></p>'
    maker_str = r'<p><span class="header">製作商:</span> <a href="https://www.seedmm.com/studio/\w{1,64}">(.*?)</a></p>'
    series_str = r'<p><span class="header">系列:</span> <a href="(https://www.seedmm.com/series/\w{1,64}?)">(.*?)</a></p>'
    tag_str = r'<span class="genre"><a href="(https://www.seedmm.com/.*?)">(.{1,64}?)</a></span>'
    # Actor links live under a different path on uncensored pages.
    if result_msg.uncensored == 'y':
        actors_url_str = r'<a href="(https://www.seedmm.com/uncensored/star/\w{1,16})">(.{1,64}?)</a>'
    else:
        actors_url_str = r'<a href="(https://www.seedmm.com/star/\w{1,16})">(.{1,64}?)</a>'

    # JavaScript variables embedded in the page; get_magnets reads them back
    # from the movie to build its request URL.
    gid_str = r'var gid = (.*?);'
    uc_str = r'var uc = (.*?);'
    img_str = r'var img = \'(.*?)\';'

    new_movie.title = search_str(title_str, detail_context)
    new_movie.no = search_str(no_str, detail_context)
    new_movie.date = search_str(date_str, detail_context)
    new_movie.length = search_str(length_str, detail_context)
    new_movie.director = sub_filename(search_str(director_str, detail_context))
    new_movie.maker = search_str(maker_str, detail_context)

    new_movie.gid = search_str(gid_str, detail_context)
    new_movie.uc = search_str(uc_str, detail_context)
    new_movie.img = search_str(img_str, detail_context)
    new_movie.uncensored = result_msg.uncensored

    # Reuse a stored Series with the same URL and name, or create one. If the
    # page yields several matches, the last one ends up on the movie.
    series_results = findall(series_str, detail_context)
    for series_info in series_results:
        series_list = models.Series.objects.filter(series_url=series_info[0],
                                                   name=series_info[1])
        if not series_list:
            series = models.Series(series_url=series_info[0],
                                   name=series_info[1])
            series.save()
            print("Series " + series.name + " saved")
        else:
            series = series_list.first()
        new_movie.series = series

    new_movie.save()

    message = "Movie: " + new_movie.no + " saved!"
    print(message)
    result_msg.movie_count += 1

    # Update the magnet table from this movie's information.
    get_magnets(new_movie)

    actors_results = findall(actors_url_str, detail_context)
    for line in actors_results:
        new_actor = actor_expression(line[0], line[1])
        models.Movie_Actor(movie=new_movie, actor=new_actor).save()

    tag_results = findall(tag_str, detail_context)
    for line in tag_results:
        new_tag = tag_expression(line[0], line[1])
        models.Movie_Tag(movie=new_movie, tag=new_tag).save()
        # NOTE(review): check_done is set (and saved) inside the loop, so a
        # movie without any tag is never marked 'y' here - confirm intended.
        new_movie.check_done = 'y'
        new_movie.save()

    # Take the current instant directly in UTC. Calling now() without a
    # time zone and then replace(tzinfo=utc) would label local wall-clock
    # time as UTC, which is off by the local offset on non-UTC hosts.
    new_movie.check_date = datetime.datetime.now(utc)
    new_movie.save()

    return new_movie
Ejemplo n.º 6
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Script Name	: test
# Author		: badger
# Created		: 2017/7/16
# Description	:
import math
import random

import datetime

from django.utils.timezone import utc

import util

import web_crawler

# floor = int(math.floor(random.random() *1e3 + 1))
#
# Sample genre markup and the tag pattern it is matched against.
full_str = r'<span class="genre"><a href="https://www.seedmm.com/genre/3u">特典あり(AVベースボール)</a></span>'
# Named tag_pattern rather than ``str`` so the builtin str() stays usable in
# this module.
tag_pattern = r'<span class="genre"><a href="(https://www.seedmm.com/.*?)">(.{1,64}?)</a></span>'

# print() call form: the shebang selects python3, where the print statement
# is a syntax error; the output is the same on Python 2.
print(util.findall(tag_pattern, full_str))

# url =r'https://www.seedmm.com/ajax/uncledatoolsbyajax.php?gid=6747707738&lang=zh&img=https://images.javbus.info/cover/10jr_b.jpg&uc=1&floor='+str(floor)
# print url
# print web_crawler.web_content(url)

# print datetime.datetime.now().replace(tzinfo=utc)
Ejemplo n.º 7
0
def getAllUrls(text):
    """Return the result of ``util.findall`` for quoted ``http:`` URLs in *text*.

    The pattern accepts either a single-quoted or a double-quoted run that
    begins with ``http:`` and extends, non-greedily, to the next quote of the
    same kind. The surrounding quotes are part of the pattern itself.
    """
    quoted_http_url = "\'http:.*?\'|\"http:.*?\""
    return util.findall(quoted_http_url, text)