Ejemplo n.º 1
0
def init(_genre):
    """Load per-genre settings and initialize this module's globals.

    Sets up the YAML-backed config/ignore lists, the Twitter API client,
    the Google Cloud Vision client, and the MongoDB collection handles
    used throughout this module.
    """
    global genre
    global setting
    global ignores
    global api
    global vision
    global tweets
    global themes
    global users
    global infos

    genre = _genre
    # shared lock so concurrent processes can read while a writer holds LOCK_EX
    with open('settings.yaml') as f:
        fcntl.flock(f, fcntl.LOCK_SH)
        # safe_load: plain yaml.load() without a Loader is deprecated and can
        # construct arbitrary Python objects from untrusted input
        setting = yaml.safe_load(f)[genre]

    with open(setting['ignores']) as f:
        fcntl.flock(f, fcntl.LOCK_SH)
        ignores = yaml.safe_load(f)

    # prepare twitter object
    api = get_api(setting['rt_account'])

    #  prepare Google Cloud Vision API
    with open('.google-api-key') as f:
        key = f.read().strip()
    vision = build('vision', 'v1', developerKey=key)

    # one client/database handle for all four collections instead of four
    # separate get_mongo_client() calls
    db = get_mongo_client()[genre + '_1draw_collections']
    tweets = db['tweets']
    themes = db['themes']
    users = db['users']
    infos = db['infos']
Ejemplo n.º 2
0
def insert_docs(docs):
    """Insert each doc into the configured entries collection, skipping
    documents whose ``_id`` already exists.

    Returns the list of ``_id`` values actually inserted.
    """
    ids = []
    c = get_mongo_client()[config['target']].entries
    for doc in docs:
        # find_one replaces the deprecated Cursor.count() (removed in
        # PyMongo 4) and avoids fetching more than one document
        if c.find_one({'_id': doc['_id']}) is not None:
            continue
        res = c.insert_one(doc)
        ids.append(res.inserted_id)
    return ids
Ejemplo n.º 3
0
def index(request):
    """Django view: render the theater-checker index page.

    Builds a per-theater, per-day table of shows for the next few days
    plus some summary counts.
    """
    db = get_mongo_client().kinpri_theater_checker
    # newest 'last_updated' value across all theater documents
    last_updated = db.theaters.find().sort([('last_updated', -1)
                                            ]).limit(1)[0]['last_updated']
    # midnight today as a naive datetime (date -> ordinal -> datetime)
    today = datetime.datetime.fromordinal(datetime.date.today().toordinal())
    day_num = 4
    days = [today + datetime.timedelta(days=i) for i in range(day_num)]
    theater_shows = [(theater,
                      [make_shows(db, theater['name'], day) for day in days])
                     for theater in db.theaters.find()]
    support_theater_num = len(db.shows_latest.distinct('theater'))
    # count_documents replaces the deprecated Cursor.count(), which was
    # removed in PyMongo 4
    # NOTE(review): '6/10' is a hard-coded premiere date — confirm it is
    # still the intended filter
    present_total_theater_num = db.theaters.count_documents({
        'start_date': parse('6/10')
    })
    total_theater_num = db.theaters.count_documents({})
    return render(
        request, 'kinpri_theater_checker/index.html', {
            'last_updated': last_updated,
            'days': days,
            'theater_shows': theater_shows,
            'support_theater_num': support_theater_num,
            'total_theater_num': total_theater_num,
            'present_total_theater_num': present_total_theater_num,
        })
Ejemplo n.º 4
0
class MovixSpider(scrapy.Spider):
    """Scrape kinpri show schedules from Movix (smt-cinema.com) theaters.

    Schedule pages are JavaScript-rendered, so requests go through Splash.
    """
    name = 'movix'
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
        'CONCURRENT_REQUESTS': 2,
    }
    allowed_domains = ['smt-cinema.com']

    # prepare start_urls
    db = get_mongo_client().kinpri_theater_checker.theaters
    # Lua script for Splash: click through each active date tab and collect
    # the rendered HTML for every day.
    # NOTE(review): get_movie calls `splash.execjs` (dot) while main uses
    # `splash:execjs` (colon method call) — confirm the dot form works.
    script = '''
treat = require("treat")

function get_movie(splash, i)
    local button = splash.execjs(
        "document.querySelectorAll('.scrollDate:not(.nonactive)')[" .. i .. "]")
    button.mouse_click()
    local res = {
        html = splash:html(),
        ok = true,
    }
    return res
end

function main(splash)
    local days = splash:execjs(
        "document.querySelectorAll('.scrollDate:not(.nonactive)').length")
    local movies = treat.as_array({})
    for i = 0, days - 1 do
        movies[i] = get_movie(splash, i)
    end
    return movies
end
'''

    def start_requests(self):
        """Yield a SplashRequest for every theater link in our domains."""
        theater_regex = re.compile('|'.join(self.allowed_domains))
        start_urls = [t['link'] for t in self.db.find({'link': theater_regex})]
        for url in start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                args={'wait': 3},
            )

    def parse(self, response):
        """Yield a Show item for each kinpri screening on the schedule page."""
        # get theater name
        # use the pre-redirect URL so it matches the link stored in MongoDB
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')

        date = response.css('#Day_schedule h1::text').extract_first()
        movies = response.css('.scheduleBox')
        for movie in movies:
            title = movie.css('h2 ::text').extract_first()

            # skip the movie is not kinpri
            if not utils.is_title_kinpri(title):
                continue

            shows = movie.css('.scheduleBox>table>tbody>tr>td')
            for s in shows:
                show = Show()
                show['updated'] = datetime.datetime.now()
                show['theater'] = theater
                show['schedule_url'] = response.url
                show['date'] = date
                show['title'] = title
                show['movie_types'] = utils.get_kinpri_types(title)
                screen = s.css('p::text').re(r'\d+')
                # cells without a screen number are padding — stop this row
                if not screen:
                    break
                show['screen'] = screen[0]
                show['start_time'] = s.css('span::text').extract_first()
                show['end_time'] = s.css('tr>td::text').re(r'\d+:\d+')[0]
                show['ticket_state'] = s.css('img::attr(alt)').extract_first()
                reservation_url = s.css('td[onclick]::attr(onclick)')
                if reservation_url:
                    # reservation link is embedded in an onclick handler
                    reservation_url = reservation_url.re(r"'(https://.+?)'")[0]
                    yield scrapy.Request(
                        url=reservation_url,
                        callback=self.parse_reservation,
                        meta={'show': show},
                    )
                else:
                    # no reservation page: emit the show with empty seat info
                    show['remaining_seats_num'] = 0
                    show['total_seats_num'] = None
                    show['reserved_seats'] = None
                    show['remaining_seats'] = []
                    show['reservation_url'] = None
                    yield show

    def parse_reservation(self, response):
        """Fill seat availability on the Show carried in response.meta."""
        show = response.meta['show']
        seats = response.css('#choice td a.tip')
        remainings = []
        reserveds = []
        for seat in seats:
            # seat_no image = already reserved; seat_off image = still free
            if seat.css('img[src*="seat_no"]'):
                id = seat.css('::attr(title)').extract_first()
                reserveds.append(id)
            elif seat.css('img[src*="seat_off"]'):
                id = seat.css('::attr(title)').extract_first()
                remainings.append(id)
        show['remaining_seats_num'] = len(remainings)
        show['total_seats_num'] = len(remainings) + len(reserveds)
        show['reserved_seats'] = reserveds
        show['remaining_seats'] = remainings
        yield show
 def open_spider(self, spider):
     """Scrapy pipeline hook: connect to MongoDB when the spider opens."""
     self.client = get_mongo_client()
     # assumes self.mongo_db was set earlier (e.g. in __init__) — TODO confirm
     self.db = self.client[self.mongo_db]
Ejemplo n.º 6
0
    return db[collection].update_one({'_id': id}, {'$inc': {
        key: value
    }},
                                     upsert=True)


def get_value_db(collection, id, key):
    """Look up ``key`` in the document with ``_id == id``.

    Returns None when the document is missing or does not contain the key.
    """
    document = db[collection].find_one({'_id': id})
    return document.get(key) if document else None


# prepare db
# module-level handles shared by the functions above
db = get_mongo_client().nanami_kyupikon
# Redis holds the queued tweet texts referenced by the queue names below
kyupikon_db = redis.Redis()
kyupikons_queue_name = 'twitter_nanami_kyupiko_kyupikons_queue'
kyupikons_reply_queue_name = 'twitter_nanami_kyupiko_kyupikons_reply_queue'

# parse args
# NOTE(review): parsing at import time consumes sys.argv — confirm this
# module is only ever executed as a script, never imported
parser = argparse.ArgumentParser()
parser.add_argument('--debug',
                    action='store_true',
                    help='enable debug mode to avoid actual tweeting')
parser.add_argument('--reset_counts',
                    action='store_true',
                    help='reset reply counts database')
args = parser.parse_args()

# prepare api object
Ejemplo n.º 7
0
class CinecittaSpider(scrapy.Spider):
    """Scrape kinpri show schedules from Cinecitta theaters.

    Navigation: theater page -> calendar iframe -> per-day schedule iframe.
    """
    name = "cinecitta"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
    }
    allowed_domains = ['cinecitta.co.jp', 'cinecitta.jp']

    # prepare start_urls
    db = get_mongo_client().kinpri_theater_checker.theaters
    theater_regex = re.compile(r'cinecitta.co.jp')
    start_urls = [t['link'] for t in db.find({'link': theater_regex})]

    def parse(self, response):
        """Resolve the theater name and follow the calendar iframe."""
        # get theater name
        # use the pre-redirect URL so it matches the link stored in MongoDB
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')

        calendar_url = response.urljoin(response.css('iframe::attr(src)').extract_first())
        yield scrapy.Request(url=calendar_url,
                             callback=self.parse_calendar,
                             meta={'theater': theater})


    def parse_calendar(self, response):
        """Follow each day link on the calendar, carrying the date in meta."""
        urls_dates = zip(
            response.css('a::attr(href)').extract(),
            response.css('a::text').extract())
        for url, date in urls_dates:
            response.meta['date'] = date
            yield response.request.replace(url=url,
                                           callback=self.parse_schedule_iframe,
                                           meta=response.meta)


    def parse_schedule_iframe(self, response):
        """Hop through the inner #ifrParent iframe to the real schedule."""
        url = response.urljoin(response.css('#ifrParent::attr(src)').extract_first())
        yield response.request.replace(url=url, callback=self.parse_schedule)


    def parse_schedule(self, response):
        """Yield a Show item for each kinpri screening on the schedule page."""
        movies = response.css('table.movietitle')
        for movie in movies:
            title = movie.css('.item1 ::text').extract_first()

            # skip the movie is not kinpri
            if not utils.is_title_kinpri(title):
                continue

            screen = movie.css('a.theaterlink::text').re(r'\d+')[0]
            shows = movie.css('.time1, .time2')
            for s in shows:
                show = Show()
                show['updated'] = datetime.datetime.now()
                show['theater'] = response.meta['theater']
                show['schedule_url'] = response.url
                show['date'] = response.meta['date']
                show['title'] = title
                show['screen'] = screen
                show['movie_types'] = utils.get_kinpri_types(title)
                times = s.css('span::text').re(r'\d+:\d+')
                # cells with no times are padding — stop this movie's row
                if not times:
                    break
                show['start_time'] = times[0]
                show['end_time'] = times[1]
                show['ticket_state'] = s.css('img::attr(src)').extract()[-1]
                show['reservation_url'] = s.css('a::attr(href)').extract_first()
                # TODO: check seats via parse_reservation; until then emit the
                # show with empty seat info (reservation_url is also cleared)
                show['remaining_seats_num'] = 0
                show['total_seats_num'] = None
                show['reserved_seats'] = None
                show['remaining_seats'] = []
                show['reservation_url'] = None
                yield show


    def parse_check_continue(self, response):
        """Handle the 'purchase in progress' interstitial before reservation.

        NOTE(review): when the interstitial matches, BOTH a POST restart
        request and the plain retry are yielded — confirm the fallthrough
        yield is intentional.
        """
        if '購入途中' in response.css('h2::text').extract_first():
            url = response.urljoin(response.css('form::attr(action)').extract()[-1])
            yield response.request.replace(url=url,
                                           callback=self.parse_reservation,
                                           method='POST',
                                           body='rm=start')
        yield response.request.replace(callback=self.parse_reservation)


    def parse_reservation(self, response):
        """Fill seat availability on the Show carried in response.meta."""
        show = response.meta['show']
        # td[value="0"] = free seat, td[value="1"] = reserved seat
        remainings = [s.css('::attr(id)').extract_first()
                     for s in response.css('#view_seat td[value="0"]')]
        reserveds = [s.css('::attr(id)').extract_first()
                     for s in response.css('#view_seat td[value="1"]')]
        show['remaining_seats_num'] = len(remainings)
        show['total_seats_num'] = len(remainings) + len(reserveds)
        show['reserved_seats'] = reserveds
        show['remaining_seats'] = remainings
        yield show
    with open('settings.yaml') as f:
        settings = yaml.load(f).get(account)
    if not settings:
        raise ValueError('There is no account name', account)
    return settings

if __name__ == '__main__':
    # timestamp each run (this script is presumably cron-driven — confirm)
    print(datetime.datetime.now())

    # ignore lists loaded once per run
    IGNORE_USERS = get_ignore_users()
    IGNORE_DATES = get_ignore_dates()
    IGNORE_IDS = get_ignore_ids()

    parser = argparse.ArgumentParser()
    parser.add_argument('account')
    parser.add_argument('command', choices=['retweet', 'update_themes'])
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()

    # per-account settings drive the API handle and database names
    settings = get_settings(args.account)
    print(settings)
    api = get_api(settings['rt_bot_screen_name'])
    tag = settings['tag']
    tws = get_mongo_client()[settings['db_name']].tweets
    ths = get_mongo_client()[settings['db_name']].themes

    # dispatch on the chosen subcommand
    if args.command == 'retweet':
        retweet()
    elif args.command == 'update_themes':
        update_themes()
Ejemplo n.º 9
0
                            'print_tomorrow',
                            'tweet_date',
                            'print_date',
                            'retweet',
                            'follow',
                            'run_command_from_tos',
                        ])
    parser.add_argument('--date')
    parser.add_argument('--delta', type=int, default=1)
    parser.add_argument('--screen_names', nargs='+')  # for retweet
    parser.add_argument('--ids', nargs='+')  # for retweet

    args = parser.parse_args()

    # prepare database
    c = get_mongo_client().kinpri_goods_wiki

    # get tweepy api
    # --debug tweets from the test account instead of the production bot
    if args.debug:
        api = get_api('sakuramochi_pre')
    else:
        api = get_api('goods_yamada')

    # run command

    # today
    if args.command == 'tweet_today':
        tweet_date_items(get_date())

    elif args.command == 'print_today':
        print_date_items(get_date())
Ejemplo n.º 10
0
        settings = yaml.load(f).get(account)
    if not settings:
        raise ValueError('There is no account name', account)
    return settings


if __name__ == '__main__':
    # timestamp each run (this script is presumably cron-driven — confirm)
    print(datetime.datetime.now())

    # ignore lists loaded once per run
    IGNORE_USERS = get_ignore_users()
    IGNORE_DATES = get_ignore_dates()
    IGNORE_IDS = get_ignore_ids()

    parser = argparse.ArgumentParser()
    parser.add_argument('account')
    parser.add_argument('command', choices=['retweet', 'update_themes'])
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()

    # per-account settings drive the API handle and database names
    settings = get_settings(args.account)
    print(settings)
    api = get_api(settings['rt_bot_screen_name'])
    tag = settings['tag']
    tws = get_mongo_client()[settings['db_name']].tweets
    ths = get_mongo_client()[settings['db_name']].themes

    # dispatch on the chosen subcommand
    if args.command == 'retweet':
        retweet()
    elif args.command == 'update_themes':
        update_themes()
Ejemplo n.º 11
0
    # subcommand selects which bot action to run below
    parser.add_argument('command', type=str, choices=[
        'tweet_today', 'print_today',
        'tweet_tomorrow', 'print_tomorrow',
        'tweet_date', 'print_date',
        'retweet', 'follow',
        'run_command_from_tos',
    ])
    parser.add_argument('--date')
    parser.add_argument('--delta', type=int, default=1)
    parser.add_argument('--screen_names', nargs='+')  # for retweet
    parser.add_argument('--ids', nargs='+')  # for retweet

    args = parser.parse_args()

    # prepare database
    c = get_mongo_client().kinpri_goods_wiki

    # get tweepy api
    # --debug tweets from the test account instead of the production bot
    if args.debug:
        api = get_api('sakuramochi_pre')
    else:
        api = get_api('goods_yamada')

    # run command

    # today
    if args.command == 'tweet_today':
        tweet_date_items(get_date())

    elif args.command == 'print_today':
        print_date_items(get_date())
Ejemplo n.º 12
0
class TtcgSpider(scrapy.Spider):
    """Scrape kinpri show schedules from TTCG (ttcg.jp) theaters."""
    name = "ttcg"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        }
    }
    allowed_domains = ['ttcg.jp']

    # prepare start_urls
    db = get_mongo_client().kinpri_theater_checker.theaters
    start_urls = [t['link'] for t in db.find({'link': re.compile(r'ttcg.jp')})]

    def parse(self, response):
        """Resolve the theater name and follow its schedule page link."""
        # get theater name
        # use the pre-redirect URL so it matches the link stored in MongoDB
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')

        url = response.css('#navschedule a::attr(href)').extract_first()
        yield scrapy.Request(url=url,
                             callback=self.parse_schedule,
                             meta={'theater': theater})

    def parse_schedule(self, response):
        """Yield Show items for one day, then follow the next-day link."""
        date = response.css('.today::text').extract_first()
        movies = response.css('.timeschedule')
        for movie in movies:
            title = movie.css(
                '.mtitle span.fontm::text').extract_first().strip()

            # skip not kinpri
            if not utils.is_title_kinpri(title):
                continue

            shows = movie.css('td')
            for s in shows:
                show = Show()

                show['updated'] = datetime.datetime.now()
                show['title'] = title
                show['movie_types'] = utils.get_kinpri_types(title)
                show['date'] = date
                show['theater'] = response.meta['theater']
                show['schedule_url'] = response.url
                show['start_time'] = s.css('.start ::text').extract_first()
                # cells without a start time are padding — stop this row
                if not show['start_time']:
                    break
                show['end_time'] = s.css('.end ::text').extract_first()
                show['screen'] = None
                state = s.css('.icon_kuuseki ::text').extract_first()
                show['ticket_state'] = state

                # NOTE(review): reservation_url is hard-coded to None, so the
                # parse_reservation branch below is currently dead code —
                # presumably a TODO placeholder; confirm
                reservation_url = None
                if reservation_url:
                    show['reservation_url'] = reservation_url
                    yield scrapy.Request(url=reservation_url,
                                         callback=self.parse_reservation,
                                         meta={'show': show})
                else:
                    yield show

        # crawl forward day by day until the next-day link loops back
        next_day_url = response.css('.schehead .b-next a::attr(href)') \
                               .extract_first()
        if not next_day_url == response.url:
            print(next_day_url)
            yield response.request.replace(url=next_day_url)

    # TODO:
    def parse_reservation(self, response):
        """Fill seat availability on the Show carried in response.meta."""
        show = response.meta['show']
        # seatOn = still free, seatOff = already reserved
        remaining = [
            s.css('::attr(title)').extract_first()
            for s in response.css('li.seatSell.seatOn')
        ]
        reserved = [
            s.css('::attr(title)').extract_first()
            for s in response.css('li.seatSell.seatOff')
        ]
        show['remaining_seats_num'] = len(remaining)
        show['total_seats_num'] = len(remaining) + len(reserved)
        show['reserved_seats'] = reserved
        show['remaining_seats'] = remaining
        yield show
Ejemplo n.º 13
0
class TohoSpider(scrapy.Spider):
    """Scrape kinpri show schedules from TOHO (tohotheater.jp) theaters.

    Schedule pages are JavaScript-rendered, so requests go through Splash.
    """
    name = 'toho'
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
        'CONCURRENT_REQUESTS': 2,
    }
    allowed_domains = ['tohotheater.jp']

    # prepare start_urls
    db = get_mongo_client().kinpri_theater_checker.theaters

    def start_requests(self):
        """Yield a SplashRequest for every theater link in our domains."""
        theater_regex = re.compile('|'.join(self.allowed_domains))
        start_urls = [t['link'] for t in self.db.find({'link': theater_regex})]
        for url in start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                args={'wait': 3},
            )

    def parse(self, response):
        """Yield a Show item for each kinpri screening on the schedule page."""
        # get theater name
        # use the pre-redirect URL so it matches the link stored in MongoDB
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')

        date = response.css('.schedule-body-day::text').extract_first()
        movies = response.css('.schedule-body-section-item')
        for movie in movies:
            title = movie.css('.schedule-body-title::text').extract_first()

            # skip the movie is not kinpri
            if not utils.is_title_kinpri(title):
                continue

            screens = movie.css('.schedule-screen')
            for s in screens:
                screen = s.css('.schedule-screen-title::text').extract_first()

                shows = s.css('.schedule-item')
                for s in shows:
                    show = Show()
                    show['updated'] = datetime.datetime.now()
                    show['theater'] = theater
                    show['schedule_url'] = response.url
                    show['date'] = date
                    show['title'] = title
                    show['movie_types'] = utils.get_kinpri_types(title)
                    show['screen'] = screen
                    show['start_time'] = s.css(
                        '.time .start::text').extract_first()
                    show['end_time'] = s.css(
                        '.time .end::text').extract_first()
                    show['ticket_state'] = s.css(
                        '.status::attr(class)').extract_first()
                    reservation_url = s.css('a')
                    # TODO: follow the reservation page (parse_reservation)
                    # once the URL extraction is implemented; until then emit
                    # the show with empty seat info
                    show['remaining_seats_num'] = 0
                    show['total_seats_num'] = None
                    show['reserved_seats'] = None
                    show['remaining_seats'] = []
                    show['reservation_url'] = None
                    yield show

    def parse_reservation(self, response):
        """Fill seat availability on the Show carried in response.meta."""
        show = response.meta['show']
        seats = response.css('#choice td a.tip')
        remainings = []
        reserveds = []
        for seat in seats:
            # seat_no image = already reserved; seat_off image = still free
            # (bug fix: the loop variable is `seat`; `eat` was a NameError)
            if seat.css('img[src*="seat_no"]'):
                id = seat.css('::attr(title)').extract_first()
                reserveds.append(id)
            elif seat.css('img[src*="seat_off"]'):
                id = seat.css('::attr(title)').extract_first()
                remainings.append(id)
        show['remaining_seats_num'] = len(remainings)
        show['total_seats_num'] = len(remainings) + len(reserveds)
        show['reserved_seats'] = reserveds
        show['remaining_seats'] = remainings
        yield show
Ejemplo n.º 14
0
class KinezoSpider(scrapy.Spider):
    """Scrape kinpri show schedules from KINEZO (T-Joy) theaters.

    Uses a mobile user agent because the mobile site exposes the schedule
    markup this spider parses.
    """
    name = "kinezo"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
        'USER_AGENT':
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.24 Mobile Safari/537.36 kinpri_theater_checker (+https://skrm.ch/prettyrhythm/kinpri-theater-checker/)',
    }
    allowed_domains = ["kinezo.jp"]

    # prepare start_urls
    db = get_mongo_client().kinpri_theater_checker.theaters
    kinezo_regex = re.compile(r'kinezo.jp|tjoy.net')
    start_urls = [t['link'] for t in db.find({'link': kinezo_regex})]

    def parse(self, response):
        """Find kinpri movies on the theater top page and follow each one."""
        # get theater name
        # use the pre-redirect URL so it matches the link stored in MongoDB
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')

        # NOTE(review): event_url is extracted and logged but never followed
        # — confirm whether the event list should also be crawled
        event_url = list(
            filter(lambda x: '/event_list' in x,
                   response.css('#headerMenuData a::attr(href)').extract()))[0]
        self.logger.info('event_url: ' + event_url)

        # parse normal list
        movies = response.css('a[name="movieItem"]')
        for movie in movies:
            title = ' '.join(movie.css('span::text').extract())
            if not utils.is_title_kinpri(title):
                continue
            url = movie.css('::attr(href)').extract_first()
            self.logger.info('title: ' + title)
            self.logger.info('url: ' + url)
            yield scrapy.Request(url=url,
                                 callback=self.parse_schedule,
                                 meta={'theater': theater})

    def parse_schedule(self, response):
        """Yield Show items per day; follow reservation pages for seat data."""

        schedule_days = response.css('#schedule p[id^="day_"]')
        schedule_list = response.css('.schedule_list ul')
        # self.logger.info('schedule_days: ' + schedule_days.extract_first())
        # day headers and schedule lists are parallel sequences
        for day, ul in zip(schedule_days, schedule_list):
            date = day.css('span::text').extract_first()
            for li in ul.css('li'):
                show = Show()

                # skip no schedule day
                state = li.css('::attr(class)').extract_first()
                if not state or state == 'noSchedule':
                    continue

                show['updated'] = datetime.datetime.now()
                title = ' '.join(response.css('.text span::text').extract())
                show['title'] = title
                show['movie_types'] = utils.get_kinpri_types(title)
                show['date'] = date
                show['ticket_state'] = state
                show['theater'] = response.meta['theater']
                # li text is [screen, "HH:MM - HH:MM"]
                show['screen'], time = li.css('::text').extract()
                show['start_time'], show['end_time'] = time.split(' - ')
                show['schedule_url'] = response.url

                reservation_url = li.css('a::attr(href)').extract_first()
                if state == 'sec05':  # soldout
                    show['remaining_seats_num'] = 0
                    show['total_seats_num'] = None
                    show['reserved_seats'] = None
                    show['remaining_seats'] = []
                    show['reservation_url'] = None
                    yield show
                else:
                    show['reservation_url'] = reservation_url
                    yield scrapy.Request(url=reservation_url,
                                         callback=self.parse_reservation,
                                         meta={'show': show})

    def parse_reservation(self, response):
        """Fill seat availability on the Show carried in response.meta."""
        show = response.meta['show']
        # seatOn = still free, seatOff = already reserved
        remaining = [
            s.css('::attr(title)').extract_first()
            for s in response.css('li.seatSell.seatOn')
        ]
        reserved = [
            s.css('::attr(title)').extract_first()
            for s in response.css('li.seatSell.seatOff')
        ]
        show['remaining_seats_num'] = len(remaining)
        show['total_seats_num'] = len(remaining) + len(reserved)
        show['reserved_seats'] = reserved
        show['remaining_seats'] = remaining
        yield show
Ejemplo n.º 15
0
        'inform',
        'retweet',
        'fetch_tos',
        'check_replies',
        'convert_birthday_to_csv',
        'update_birthday_spreadsheet',
        'add_ignore_users',
        'remove_ignore_users',
    ])
    parser.add_argument('--users', '-u', nargs='+')
    parser.add_argument('--target_works', nargs='+')
    parser.add_argument('--ids', nargs='+')
    args = parser.parse_args()

    # API handle and per-account tweet/reply collections
    api = get_api(args.account)
    tws = get_mongo_client()[api.auth.username].tweets
    replies = get_mongo_client()[api.auth.username].replies

    # dispatch on the chosen subcommand
    if args.command == 'inform':
        inform()
    elif args.command == 'retweet':
        # --ids restricts the retweet run to specific tweet ids
        if args.ids:
            retweet(args.ids)
        else:
            retweet()
    elif args.command == 'fetch_tos':
        fetch_tos()
    elif args.command == 'check_replies':
        check_replies()
    elif args.command == 'convert_birthday_to_csv':
        convert_birthday_to_csv()
class AeoncinemaSpider(scrapy.Spider):
    """Scrape kinpri show schedules from AEON Cinema theaters."""
    name = "aeoncinema"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        }
    }
    allowed_domains = ["aeoncinema.com"]

    # prepare start_urls
    db = get_mongo_client().kinpri_theater_checker.theaters
    aeoncinema_regex = re.compile(r'aeoncinema.com')
    start_urls = [t['link'] for t in db.find({'link': aeoncinema_regex})]


    def parse(self, response):
        """Resolve the theater name and follow its schedule page link."""
        # get theater name
        # use the pre-redirect URL so it matches the link stored in MongoDB
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')

        url = response.css('li.schedule a::attr(href)').extract_first()
        yield scrapy.Request(url=url, callback=self.parse_schedule,
                             meta={'theater': theater})


    def parse_schedule(self, response):
        """Yield a Show item for each kinpri screening on the schedule page."""

        date = response.css('.today::text').extract_first()
        movies = response.css('.movielist')
        for movie in movies:
            title = movie.css('.main a::text').extract_first().strip()

            # skip not kinpri
            if not utils.is_title_kinpri(title):
                continue

            # first .tbl row is the header, so skip it
            shows = movie.css('.timetbl [class^="tbl"]')[1:]
            for s in shows:
                show = Show()

                show['updated'] = datetime.datetime.now()
                show['title'] = title
                show['movie_types'] = utils.get_kinpri_types(title)
                show['date'] = date
                show['theater'] = response.meta['theater']
                show['schedule_url'] = response.url

                # normalize "HH時MM分"-style text into "HH:MM"
                start_time, end_time = s.css('.time ::text').extract()
                show['start_time'] = ':'.join(re.findall(r'(\d{1,2})', start_time))
                show['end_time'] = ':'.join(re.findall(r'(\d{1,2})', end_time))
                show['screen'] = s.css('.screen ::text').extract_first()
                state = s.css('.icon_kuuseki ::text').extract_first()
                show['ticket_state'] = state

                # TODO: run javascript via splash
                # NOTE(review): reservation_url is hard-coded to None, so the
                # parse_reservation branch below is currently dead code
                reservation_url = None
                if reservation_url:
                    show['reservation_url'] = reservation_url
                    yield scrapy.Request(url=reservation_url,
                                         callback=self.parse_reservation,
                                         meta={'show': show})
                else:
                    yield show


    # TODO:
    def parse_reservation(self, response):
        """Fill seat availability on the Show carried in response.meta."""
        show = response.meta['show']
        # seatOn = still free, seatOff = already reserved
        remaining = [s.css('::attr(title)').extract_first()
                     for s in response.css('li.seatSell.seatOn')]
        reserved = [s.css('::attr(title)').extract_first()
                    for s in response.css('li.seatSell.seatOff')]
        show['remaining_seats_num'] = len(remaining)
        show['total_seats_num'] = len(remaining) + len(reserved)
        show['reserved_seats'] = reserved
        show['remaining_seats'] = remaining
        yield show
Ejemplo n.º 17
0
def tweet_new_docs():
    """Tweet every entry not yet marked as tweeted, flagging each success.

    Entries are processed in ``_id`` order; after ``tweet_doc`` returns, the
    document with the returned id is marked ``meta.tweeted = True``.
    """
    entries = get_mongo_client()[config['target']].entries
    pending = entries.find({'meta.tweeted': False}).sort('_id')
    for entry in pending:
        posted_id = tweet_doc(entry)
        entries.update_one({'_id': posted_id},
                           {'$set': {'meta.tweeted': True}})
class KinezoSpider(scrapy.Spider):
    """Scrapy spider that collects Kinpri screening schedules from
    United Cinemas theater pages.

    NOTE(review): the class name says "Kinezo" but ``name`` and the
    crawled domain are United Cinemas — looks copy-pasted from another
    spider; confirm before relying on the class name.
    """

    name = "unitedcinemas"
    # Route scraped Show items through the project's ShowPipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
    }
    allowed_domains = ["unitedcinemas.jp"]

    # prepare start_urls: every theater document whose link points at
    # unitedcinemas.jp becomes a seed URL.
    db = get_mongo_client().kinpri_theater_checker.theaters
    theater_regex = re.compile(r'unitedcinemas.jp')
    start_urls = [t['link'] for t in db.find({'link': theater_regex})]

    def parse(self, response):
        """Resolve the theater name for this page and follow every day
        in the schedule calendar carousel."""
        # TODO: create start_requests() and get theater & url in it

        # get theater name; if the request was redirected, look the
        # theater up by the URL originally requested, since that is
        # what is stored in the theaters collection.
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')

        urls = response.css('#carouselCalendar li a::attr(href)').extract()
        for url in urls:
            next_url = response.urljoin(url)
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_schedule,
                                 meta={'theater': theater})

    def parse_schedule(self, response):
        """Emit a Show item for every Kinpri screening on one day's
        schedule page; follow the reservation link when one exists."""
        # NOTE(review): 'date' is filled with the page URL here, not a
        # parsed date — confirm downstream consumers expect that.
        date = response.url
        movies = response.css('#dailyList>li')
        for movie in movies:
            title = movie.css('.movieTitle a::text').extract_first()

            # skip the movie is not kinpri
            if not utils.is_title_kinpri(title):
                continue

            shows_rows = movie.css('.tl>li')
            for shows_row in shows_rows:
                # Screen number is encoded in the image's alt text.
                screen = shows_row.css('.screenNumber img::attr(alt)').re(
                    r'(\d+)')[0]
                shows = shows_row.css('div')
                for s in shows:
                    show = Show()
                    show['updated'] = datetime.datetime.now()
                    show['theater'] = response.meta['theater']
                    show['schedule_url'] = response.url
                    show['date'] = date
                    show['title'] = title
                    show['screen'] = screen
                    show['movie_types'] = utils.get_kinpri_types(title)
                    show['start_time'] = s.css(
                        '.startTime::text').extract_first()
                    show['end_time'] = s.css('.endTime::text').extract_first()
                    # Ticket state is the single character inside [] in
                    # the schedule icon's alt text, when present.
                    state = s.css('.uolIcon .scheduleIcon::attr(alt)').re(
                        r'\[(.)\]')
                    if state:
                        show['ticket_state'] = state[0]
                    else:
                        show['ticket_state'] = None
                    # NOTE(review): the reservation link is selected from
                    # the whole movie block, so every show of this movie
                    # gets the same URL — confirm that is intended.
                    reservation_url = movie.css(
                        '.uolIcon a::attr(href)').extract_first()
                    if reservation_url:
                        show['reservation_url'] = reservation_url
                        # dont_filter: several shows share one
                        # reservation URL, so the dupe filter must not
                        # drop the repeated requests.
                        yield scrapy.Request(
                            url=reservation_url,
                            callback=self.parse_check_continue,
                            meta={'show': show},
                            dont_filter=True,
                        )
                    else:
                        # No reservation page: record the show with
                        # empty seat data instead.
                        show['remaining_seats_num'] = 0
                        show['total_seats_num'] = None
                        show['reserved_seats'] = None
                        show['remaining_seats'] = []
                        show['reservation_url'] = None
                        yield show

    def parse_check_continue(self, response):
        """If an in-progress-purchase interstitial page appears, POST
        past it before parsing seats; otherwise re-request the page for
        seat parsing.

        NOTE(review): when the interstitial matches, BOTH the POST
        continuation and the plain retry are yielded — possibly a
        missing ``else``.  Also ``extract_first()`` can return None,
        which would make the ``in`` test raise TypeError.  Confirm.
        """
        if '購入途中' in response.css('h2::text').extract_first():
            url = response.urljoin(
                response.css('form::attr(action)').extract()[-1])
            yield response.request.replace(url=url,
                                           callback=self.parse_reservation,
                                           method='POST',
                                           body='rm=start')
        yield response.request.replace(callback=self.parse_reservation)

    def parse_reservation(self, response):
        """Attach seat availability parsed from the seat map to the show
        item and emit it.

        Seats are <td> cells inside #view_seat whose ``value`` attribute
        is "0" (free) or "1" (reserved); the cell id names the seat.
        """
        show = response.meta['show']
        remainings = [
            s.css('::attr(id)').extract_first()
            for s in response.css('#view_seat td[value="0"]')
        ]
        reserveds = [
            s.css('::attr(id)').extract_first()
            for s in response.css('#view_seat td[value="1"]')
        ]
        show['remaining_seats_num'] = len(remainings)
        show['total_seats_num'] = len(remainings) + len(reserveds)
        show['reserved_seats'] = reserveds
        show['remaining_seats'] = remainings
        yield show
import datetime
from dateutil.parser import parse
from urllib.parse import unquote
from get_mongo_client import get_mongo_client

# Dump goods released since 2015 as wiki-table rows: "| date | [[name>page]] |  |".
items = get_mongo_client().kinpri_goods_wiki.items

for item in items.find({'date': {'$gte': parse('2015-1-1')}}).sort('date'):
    extra = item['date_extra']
    # "X月〇旬" when only a rough part of the month is known, else the exact day.
    fmt = '%m月{}旬'.format(extra) if extra else '%m月%d日'
    date = item['date'].strftime(fmt)
    name = item['name']
    for series in ('KING OF PRISM by PrettyRhythm', 'KING OF PRISM'):
        name = name.replace(series, '')
    name = name.strip()
    # Wiki page name is the last (euc-jp percent-encoded) path component.
    page = unquote(item['url'].split('/')[-1], encoding='euc-jp')
    print('| {} | [[{}>{}]] |  |'.format(date, name, page))
import datetime
from dateutil.parser import parse
from urllib.parse import unquote
from get_mongo_client import get_mongo_client

def _format_row(doc):
    """Render one goods document as a wiki-table row."""
    if doc['date_extra']:
        # Only a rough part of the month is known ("上旬/中旬/下旬").
        date = doc['date'].strftime('%m月{}旬'.format(doc['date_extra']))
    else:
        date = doc['date'].strftime('%m月%d日')
    title = doc['name'].replace('KING OF PRISM by PrettyRhythm', '')
    title = title.replace('KING OF PRISM', '').strip()
    # Wiki page name is the last (euc-jp percent-encoded) path component.
    page = unquote(doc['url'].split('/')[-1], encoding='euc-jp')
    return '| {} | [[{}>{}]] |  |'.format(date, title, page)


cli = get_mongo_client()
c = cli.kinpri_goods_wiki.items
for doc in c.find({'date': {'$gte': parse('2015-1-1')}}).sort('date'):
    print(_format_row(doc))