Example #1
    def test_check_url(self):
        for url in self.valid_urls:
            self.assertTrue(utils.check_url(url),
                            "valid URL should pass the check")

        for url in self.invalid_urls:
            self.assertFalse(utils.check_url(url), "invalid URL should be caught")
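None of these projects show the utils.check_url implementation itself. In Example #1 it acts as a plain boolean validator, so a minimal stand-in consistent with that contract might look like the sketch below (the scheme whitelist and the purely syntactic check are assumptions, not the original helper):

# Hypothetical stand-in for the boolean check_url used in Example #1:
# purely syntactic validation, no network access.
from urllib.parse import urlparse

def check_url(url):
    """Return True if url looks like a well-formed http(s) URL."""
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)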
Example #2
def create_reference_detail(request, **kwargs):
    if request.method == 'GET' and 'url' in request.GET:
        try:
            url, title = check_url(request.GET['url'])
            kwargs.setdefault('extra_context', {}).update({'parsed_url': url, 'parsed_title': title})
        except IOError as e:
            messages.error(request, _(str(e)), fail_silently=True)
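Example #2 assumes a very different check_url contract: it fetches the page, returns a (url, title) pair, and signals failure by raising IOError. A standard-library sketch of that contract follows; the title regex, read limit, and timeout are assumptions:

# Hypothetical sketch of the (url, title) contract assumed by Example #2.
import re
from urllib.request import urlopen
from urllib.error import URLError

def check_url(url):
    """Fetch url and return (final_url, page_title); raise IOError on failure."""
    try:
        with urlopen(url, timeout=10) as resp:
            html = resp.read(65536).decode('utf-8', errors='replace')
            final_url = resp.geturl()
    except URLError as e:
        raise IOError(str(e))
    m = re.search(r'<title[^>]*>(.*?)</title>', html, re.I | re.S)
    return final_url, m.group(1).strip() if m else ''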
Example #3
def check_arg_input(parser, args):
    if not args.input:
        parser.error("-i, --input INPUT is required.")

    loader = Loader.get_loader(args.input)
    if loader == FSLoader:
        if os.path.isfile(args.input):
            if not is_a_supported_image_file_extension(args.input):
                parser.error("Input file {} is not a supported format.".format(args.input))
            check_image_file_validity(args.input)
    elif loader == HTTPLoader:
        if not check_url(args.input):
            parser.error("URL {} does not exist or is not accessible.".format(args.input))
        if not is_a_supported_image_file_extension(args.input):
            parser.error("URL {} does not point to a file with a supported extension.".format(args.input))
    else:
        parser.error("Input {} is not a valid file, directory, or URL.".format(args.input))
    return args.input
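For context, a minimal sketch of how a validator like check_arg_input plugs into argparse; the parser wiring here is an assumption, and Loader/FSLoader/HTTPLoader plus the helper checks belong to the original project:

# Hypothetical wiring for Example #3.
import argparse

def main(argv=None):
    parser = argparse.ArgumentParser(description="Process an image file or URL")
    parser.add_argument("-i", "--input", help="path, directory, or URL")
    args = parser.parse_args(argv)
    input_path = check_arg_input(parser, args)  # calls parser.error() and exits if invalid
    print("validated input:", input_path)

if __name__ == "__main__":
    main()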
Example #4
    def POST(self, userurlkey=None):
        url = web.data()
        if len(url) > config.MAX_LEN_URL:
            web.ctx.status = "400 Bad request"
            return "url too long"
        if userurlkey is not None:
            print(userurlkey)
            if len(userurlkey) < config.MIN_LEN_USERURLKEY:
                web.ctx.status = "400 Bad request"
                return "key too short"
            if len(userurlkey) > config.MAX_LEN_URLKEY:
                web.ctx.status = "400 Bad request"
                return "key too long"
            userurlkey = utils.encode_string(userurlkey).lower()

        url = utils.encode_string(url)
        if not utils.check_url(url):
            web.ctx.status = "400 Bad request"
            return "bad url"

        ret, n_affected, urlkey = model.url_new(url, userurlkey)
        if ret != 0:
            return web.internalerror("db error")

        retval = {
            "is_created": n_affected == 1,
            "key": urlkey
        }
        web.ctx.status = "200 OK"
        return json.dumps(retval)
Example #5
def web_shorten(url):

    url = url.strip()

    if len(url) < 2 or not utils.check_url(url):
        return no_url()

    conn = utils.create_connection("test.db")

    check = utils.check_entry(url, conn)

    db_url = check[1] if check else False

    if db_url and db_url == url:
        conn.close()
        return already_used()

    shortcode = utils.make_key(6)

    _date = utils.get_date()

    utils.new_entry(url, shortcode, _date, _date, conn)
    conn.close()

    return shortcode
Example #6
def shorten():

    shortcode = ""

    # Guard non-POST requests up front; in the original, a non-POST request
    # fell through to the bottom with url and conn undefined.
    if request.method != 'POST':
        return no_url()

    received = request.get_json(force=True)

    url = received.get("url") or ""

    if len(url) < 2 or not utils.check_url(url):
        return no_url()

    conn = utils.create_connection("test.db")

    check = utils.check_entry(url, conn)
    db_url = check[1] if check else False

    if db_url and db_url == url:
        conn.close()
        return already_used()

    try:
        shortcode = received["shortcode"]
    except KeyError:
        logging.warning("No shortcode provided, generating one...")
        shortcode = utils.make_key(6)

    if not utils.check_shortcode(shortcode):
        conn.close()
        return invalid_code()

    _date = utils.get_date()
    utils.new_entry(url, shortcode, _date, _date, conn)
    conn.close()
    return flask.make_response(shortcode, 201)
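Assuming the route above is mounted at /shorten on a local development server (both are assumptions, not shown in the example), a client call might look like:

# Hypothetical client for Example #6.
import requests

resp = requests.post("http://localhost:5000/shorten",
                     json={"url": "https://example.com/some/long/path"})
print(resp.status_code)  # 201 on success
print(resp.text)         # the generated or submitted shortcode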
Example #7
def shorten_url():
    # Takes a JSON-encoded url from the request body and returns a shortened
    # url if the submitted one is valid; returns an error otherwise.
    url = ''
    if request.is_json:
        url = (request.get_json() or {}).get("url", '')
        if utils.check_url(utils.convert(url)):
            return jsonify({
                "shortened_url":
                '{}{}'.format(HOST, dal.add_url(url))
            }), status.HTTP_201_CREATED
    return 'Malformed URL: {}'.format(url), status.HTTP_400_BAD_REQUEST
Example #8
    def check_url(self, url):
        valid, code = utils.check_url(url)
        if not valid:
            logger.warning('slice ' + str(self.slice) + ' partition_num ' + str(self.partition_num) + ' failed to get url ' + url)
            to_add_url = self.default_url
            change = True
        elif code == 200:
            logger.info('slice ' + str(self.slice) + ' partition_num ' + str(self.partition_num) + ' code ' + str(code) + ' ' + url)
            to_add_url = url
            change = False
        else:
            logger.warning('slice ' + str(self.slice) + ' partition_num ' + str(self.partition_num) + ' failed to get url ' + url + ' retcode ' + str(code))
            to_add_url = self.default_url
            change = True
        return to_add_url, change
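Example #8 expects utils.check_url to return a (valid, status_code) pair. A sketch of that contract; the use of requests and a HEAD probe is an assumption:

# Hypothetical sketch of the (valid, status_code) contract from Example #8.
import requests

def check_url(url, timeout=5):
    """Return (True, status_code) if the request completes, else (False, None)."""
    try:
        resp = requests.head(url, timeout=timeout, allow_redirects=True)
        return True, resp.status_code
    except requests.RequestException:
        return False, None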
Example #9
def redirect_url(code):
    # Takes the variable from the url, attempts to decode it, and checks the
    # DB for a match. If a match is found, or the original is an un-encoded
    # url, it attempts a redirect; otherwise it returns an error.
    if utils.check_url(utils.convert(code)):
        return redirect(utils.convert(code), code=302)
    else:
        try:
            result = dal.lookup_url(code)
            if result:
                return redirect(utils.convert(result), code=302)
            else:
                return 'Malformed URL: {}{}'.format(
                    HOST, code), status.HTTP_400_BAD_REQUEST
        except Exception:
            return 'Malformed URL: {}{}'.format(
                HOST, code), status.HTTP_400_BAD_REQUEST
Example #10
    def POST(self, urlkey):
        url = web.data()
        if len(url) > config.MAX_LEN_URL:
            web.ctx.status = "400 Bad request"
            return "url too long"
        if urlkey is not None:
            if len(urlkey) > config.MAX_LEN_URLKEY:
                web.ctx.status = "400 Bad request"
                return "key too long"
            urlkey = utils.encode_string(urlkey).lower()

        url = utils.encode_string(url)
        if not utils.check_url(url):
            web.ctx.status = "400 Bad request"
            return "bad url"

        ret, n_affected = model.url_modify(urlkey, url)
        if ret != 0:
            return web.internalerror("db error")

        ret_val = {'n_affected': n_affected}
        return json.dumps(ret_val)
Example #11
def download_data_from_url(url, task_id, base_url=None, depth=1):
    folder = get_folder(task_id)
    if base_url is None:
        base_url = re.search(r'((https|http)://[\w_\-.]+)', url)
        if not base_url:
            raise requests.exceptions.InvalidURL(
                f"This is not a valid URL: {url}")
        base_url = base_url.group(1)

    response = requests.get(url)

    http_encoding = (response.encoding
                     if 'charset' in response.headers.get('content-type', '').lower()
                     else None)
    html_encoding = EncodingDetector.find_declared_encoding(response.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=encoding)

    with open(folder + 'index.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    download_media(soup, folder, base_url)
    download_js(soup, folder, base_url)
    download_css(soup, folder, base_url)

    if depth > 0:
        links = set(
            map(lambda x: transform_url(x, base_url), find_another_urls(soup)))
        for i, link in enumerate(
                filter(lambda x: check_url(x, base_url), links)):
            try:
                download_data_from_url(link,
                                       "{0}/{1}".format(task_id, i),
                                       base_url=base_url,
                                       depth=depth - 1)
            except requests.exceptions.RequestException as e:
                logging.error(
                    f'Exception occurred while request to {url}\n {e}')
    return folder
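The two-argument check_url(x, base_url) in Example #11 filters the crawl to links on the same site. A sketch of such a filter; this is an assumption, not the project's actual helper:

# Hypothetical same-origin filter matching the call site in Example #11.
from urllib.parse import urlparse

def check_url(url, base_url):
    """Return True if url is an http(s) link on the same host as base_url."""
    if not url:
        return False
    parts = urlparse(url)
    return parts.scheme in ('http', 'https') and parts.netloc == urlparse(base_url).netloc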
Example #12
def process_round_data(data):
	j = json.loads(data) 
	items = j['dataFeederResponse']['req2']['row']
	for item in items:
		try:
			date = datetime.datetime.strptime(item['LocalStartTime'], "%d/%m/%Y %I:%M:%S %p")
			year = int(date.year)
			short_year = date.strftime('%y')
			round = int(item['RoundName'].split(" ")[-1])
			round_pad = "%02d" % round
			round_string = "RD%s" % round_pad
			round_short = round_string
			round_name = item['RoundName']
			home_id = item['HomeTeamReference'].lower()
			away_id = item['AwayTeamReference'].lower()
			afl_id = int(item['Id'])
			final = False

			# Set the game type
			if item['SeasonName'].find('Final') > -1:
				type = 'final'
				round_string = "FinalsW%d" % round
				round_short = "FW%d" % round
				final = True
			else:
				type = 'premiership'

			# Set the teams
			for team in static.TEAMS:
				if team['id'] == home_id:
					home_val = team['val']
					home_name = team['name']
				if team['id'] == away_id:
					away_val = team['val']
					away_name = team['name']

			# Create a new match
			match = get_or_new_match(afl_id)
			match.hometeam = home_id
			match.awayteam = away_id
			match.date = date
			match.round = round
			match.type = type
			match.afl_id = afl_id
			match.thumbnail = '/img/thumb-match.jpg'
		
			match_videos = []

			if year < 2011 and round < 10 and not final:

				video_url_med = "http://pd.streaming.telstra.com/pd_afl0/OnDemand/%d/ON/iVideo/Premiership/%s/NV_%s_%sV%s_1M.mp4" % (year, round_string, round_string.title(), home_val, away_val)
				if utils.check_url(video_url_med) != 404:
					video = get_or_new_video(video_url_med)
					video.name = "%s %s %s v %s" % (round_string.title(), year, home_name, away_name)
					video.thumbnail = "%sthumb/match-replay.jpg" % (settings.MEDIA_URL) 
					video.date = date
					video.urls = []

					# Test for low quality (172k stream)
					video_low_qual = re.sub(r"1[mM][bB]?\.mp4", "172K.mp4", video_url_med)
					if utils.check_url(video_low_qual) != 404:
						print("Found low-res video for %s" % video.name)
						video.urls.insert(static.QUAL_LOW, video_low_qual)
					else:
						video.urls.insert(static.QUAL_LOW, None)

					# Just blindly insert the medium quality stream
					video.urls.insert(static.QUAL_MED, video_url_med)

					# Test for high quality (2Mb stream)
					video_high_qual = re.sub(r"1[mM][bB]?\.mp4", "2M.mp4", video_url_med)
					if utils.check_url(video_high_qual) != 404:
						print("Found high-res video for %s" % video.name)
						video.urls.insert(static.QUAL_HIGH, video_high_qual)
					else:
						video.urls.insert(static.QUAL_HIGH, None)

					video = utils.tag_video(video, 'replay',)
					print("Saving video: %s" % video)
					video.save()

					match_videos.append(video.pk)

			else:
				for i, qtr in enumerate(['1st','2nd','3rd','4th']):
					video_url_med = "http://bptvpd.ngcdn.telstra.com/pd_afl0/OnDemand/%d/ON/iVideo/Premiership/%s/AFL%s_%s_%s_vs_%s_%s_qr_full_1M.mp4" % (year, round_string, short_year, round_short.lower(), home_id, away_id, qtr)

					if utils.check_url(video_url_med) != 404:
						video = get_or_new_video(video_url_med)

						if final:
							video.name = "%s %s %s v %s (%s Qtr)" % (round_name, year, home_name, away_name, qtr)
						else:
							video.name = "%s %s %s v %s (%s Qtr)" % (round_string.title(), year, home_name, away_name, qtr)

						video.thumbnail = "%sthumb/match-replay-%s-qtr.jpg" % (settings.MEDIA_URL, qtr)
						video.date = date
						video.urls = []

						# Test for low quality (172k stream)
						video_low_qual = re.sub(r"1[mM][bB]?\.mp4", "172K.mp4", video_url_med)
						if utils.check_url(video_low_qual) != 404:
							print("Found low-res video for %s" % video.name)
							video.urls.insert(static.QUAL_LOW, video_low_qual)

						# Just blindly insert the medium quality stream
						video.urls.insert(static.QUAL_MED, video_url_med)

						# Test for high quality (2Mb stream)
						video_high_qual = re.sub(r"1[mM][bB]?\.mp4", "2M.mp4", video_url_med)
						if utils.check_url(video_high_qual) != 404:
							print("Found high-res video for %s" % video.name)
							video.urls.insert(static.QUAL_HIGH, video_high_qual)

						video = utils.tag_video(video, 'replay')
						print("Saving video: %s" % video)
						video.save()

						match_videos.append(video.pk)

			match.videos = match_videos
			print("Saving match: %s" % match.get_title())
			r = match.save()

		except Exception:
			logging.exception("Failed to parse match")
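Example #12 (and Examples #14/#15 below) compares utils.check_url(...) against 404, which implies the helper returns an HTTP status code. A sketch of that third contract; the HEAD probe and the 404-on-failure fallback are assumptions:

# Hypothetical status-code contract used by Example #12.
import requests

def check_url(url, timeout=5):
    """Return the HTTP status code for url, or 404 if the request fails."""
    try:
        return requests.head(url, timeout=timeout, allow_redirects=True).status_code
    except requests.RequestException:
        return 404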
Example #13
    def test_check_url(self):
        url = 'http://workflow.isi.edu/MINT/FLDAS/FLDAS_NOAH01_A_EA_D.001/2019/04/FLDAS_NOAH01_A_EA_D.A20190401.001.nc'
        print('url: ', url)

        result = utils.check_url(url)
        self.assertTrue(result)
Example #14
def download_pitch_data_only(args, lm=None):
    # return True or False
    pdata_url = 'http://m.sports.naver.com/ajax/baseball/gamecenter/kbo/pitches.nhn'
    pdata_header_row = [
        'x0', 'y0', 'z0', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'plateX',
        'plateZ', 'crossPlateX', 'crossPlateY', 'topSz', 'bottomSz', 'stuff',
        'speed', 'pitcherName', 'batterName'
    ]

    game_ids = get_game_ids(args)
    if (game_ids is None) or (len(game_ids) == 0):
        print('no game ids')
        print('args: {}'.format(args))
        if lm is not None:
            lm.log('no game ids')
            lm.log('args: {}'.format(args))
        return False

    if lm is not None:
        lm.resetLogHandler()
        lm.setLogPath(os.getcwd())
        lm.setLogFileName('pitch_data_download_log.txt')
        lm.cleanLog()
        lm.createLogHandler()
        lm.log('---- Pitch Data Download Log ----')

    if not os.path.isdir('pbp_data'):
        os.mkdir('pbp_data')
    os.chdir('pbp_data')
    # path: pbp_data

    print("##################################################")
    print("######         DOWNLOAD PITCH DATA         #######")
    print("##################################################")

    for year in game_ids.keys():
        start1 = time.time()
        print(" Year {}".format(year))
        if len(game_ids[year]) == 0:
            print('month id is empty')
            print('args: {}'.format(args))
            if lm is not None:
                lm.log('month id is empty')
                lm.log('args : {}'.format(args))
            return False

        if not os.path.isdir(str(year)):
            os.mkdir(str(year))
        os.chdir(str(year))
        # path: pbp_data/year

        year_fp = open(f'{year}_pdata.csv', 'w', newline='\n')
        year_cf = csv.writer(year_fp)
        year_cf.writerow(pdata_header_row)

        for month in game_ids[year].keys():
            start2 = time.time()
            print("  Month {}".format(month))
            if len(game_ids[year][month]) == 0:
                print('month id is empty')
                print('args: {}'.format(args))
                if lm is not None:
                    lm.log('month id is empty')
                    lm.log('args : {}'.format(args))
                return False

            if not os.path.isdir(str(month)):
                os.mkdir(str(month))
            os.chdir(str(month))
            # path: pbp_data/year/month

            month_fp = open(f'{year}_{month}_pdata.csv', 'w', newline='\n')
            month_cf = csv.writer(month_fp)
            month_cf.writerow(pdata_header_row)

            # download
            done = 0
            skipped = 0
            for game_id in game_ids[year][month]:
                if (int(game_id[:4]) < 2008) or (int(game_id[:4]) >
                                                 datetime.datetime.now().year):
                    skipped += 1
                    continue
                if (int(game_id[:4]) == datetime.datetime.now().year) and (int(
                        game_id[4:8]) > int(
                            datetime.datetime.now().date().strftime('%m%d'))):
                    skipped += 1
                    continue
                if int(game_id[4:8]) < int(regular_start[game_id[:4]]):
                    skipped += 1
                    continue
                if int(game_id[4:8]) >= int(playoff_start[game_id[:4]]):
                    skipped += 1
                    continue
                if game_id[8:10] not in teams:
                    skipped += 1
                    continue

                if not check_url(pdata_url):
                    skipped += 1
                    if lm is not None:
                        lm.log('URL error : {}'.format(pdata_url))
                    continue

                if (int(game_id[:4]) == datetime.datetime.now().year) and \
                   (int(game_id[4:6]) == datetime.datetime.now().month) and \
                   (int(game_id[6:8]) == datetime.datetime.now().day):
                    # today's game: fall through and (re)download
                    pass
                elif (os.path.isfile(game_id + '_pdata.json')) and \
                        (os.path.getsize(game_id + '_pdata.json') > 0):
                    done += 1
                    if lm is not None:
                        lm.log('File Duplicate : {}'.format(game_id))
                    continue

                params = {'gameId': game_id}

                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                    'Chrome/59.0.3071.115 Safari/537.36',
                    'X-Requested-With':
                    'XMLHttpRequest',
                    'Host':
                    'm.sports.naver.com',
                    'Referer':
                    'http://m.sports.naver.com/baseball/gamecenter/kbo/index.nhn?&gameId='
                    + game_id + '&tab=relay'
                }

                response = requests.get(pdata_url,
                                        params=params,
                                        headers=headers)

                if response is not None:
                    # load json structure
                    js = response.json()
                    if isinstance(js, str):
                        js = json.loads(js)
                        #js = ast.literal_eval(js)

                    if js is None or len(js) == 0:
                        if lm is not None:
                            lm.log('Pitch data missing : {}'.format(game_id))
                        skipped += 1
                        continue

                    # json to pandas dataframe
                    #df = pd.read_json(json.dumps(js))
                    df = pd.DataFrame(js)

                    # calculate pitch location(px, pz)
                    t = -df['vy0'] - np.sqrt(df['vy0'] * df['vy0'] -
                                             2 * df['ay'] *
                                             (df['y0'] - df['crossPlateY']))
                    t /= df['ay']
                    xp = df['x0'] + df['vx0'] * t + df['ax'] * t * t * 0.5
                    zp = df['z0'] + df['vz0'] * t + df['az'] * t * t * 0.5
                    df['plateX'] = np.round(xp, 5)
                    df['plateZ'] = np.round(zp, 5)

                    # calculate pitch movement(pfx_x, pfx_z)
                    t40 = -df['vy0'] - np.sqrt(df['vy0'] * df['vy0'] -
                                               2 * df['ay'] * (df['y0'] - 40))
                    t40 /= df['ay']
                    x40 = df['x0'] + df['vx0'] * t40 + 0.5 * df['ax'] * t40 * t40
                    vx40 = df['vx0'] + df['ax'] * t40
                    z40 = df['z0'] + df['vz0'] * t40 + 0.5 * df['az'] * t40 * t40
                    vz40 = df['vz0'] + df['az'] * t40
                    th = t - t40
                    x_no_air = x40 + vx40 * th
                    z_no_air = z40 + vz40 * th - 0.5 * 32.174 * th * th
                    df['pfx_x'] = np.round((xp - x_no_air) * 12, 5)
                    df['pfx_z'] = np.round((zp - z_no_air) * 12, 5)

                    # load back to json structure
                    dfjsstr = df.to_json(orient='records', force_ascii=False)
                    dfjs = json.loads(dfjsstr)

                    # dump to json file
                    fp = open(game_id + '_pdata.json', 'w', newline='\n')
                    json.dump(dfjs,
                              fp,
                              ensure_ascii=False,
                              sort_keys=False,
                              indent=4)
                    fp.close()

                    # dump to csv file
                    fp = open(game_id + '_pdata.csv', 'w', newline='\n')
                    cf = csv.writer(fp)
                    cf.writerow(pdata_header_row)

                    for x in dfjs:
                        row = [
                            x['x0'], x['y0'], x['z0'], x['vx0'], x['vy0'],
                            x['vz0'], x['ax'], x['ay'], x['az'], x['plateX'],
                            x['plateZ'], x['crossPlateX'], x['crossPlateY'],
                            x['topSz'], x['bottomSz'], x['stuff'], x['speed'],
                            x['pitcherName'], x['batterName']
                        ]
                        month_cf.writerow(row)
                        year_cf.writerow(row)
                        cf.writerow(row)

                    fp.close()

                    done += 1
                else:
                    skipped += 1
                    if lm is not None:
                        lm.log('Cannot get response : {}'.format(game_id))

                print_progress('    Downloading: ', len(game_ids[year][month]),
                               done, skipped)

            # download done
            print_progress('    Downloading: ', len(game_ids[year][month]),
                           done, skipped)
            print('\n        Downloaded {} files'.format(done))
            print('        (Skipped {} files)'.format(skipped))
            end2 = time.time()
            print('            -- elapsed {:.3f} sec for month {}'.format(
                end2 - start2, month))
            month_fp.close()

            os.chdir('..')
            # path: pbp_data/year
        end1 = time.time()
        print('   -- elapsed {:.3f} sec for year {}'.format(
            end1 - start1, year))
        # months done
        year_fp.close()

        os.chdir('..')

        # path: pbp_data/
    # years done
    os.chdir('..')
    # path: root
    return True
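A note on the trajectory math above: with constant acceleration, the ball's depth is y(t) = y0 + vy0*t + (1/2)*ay*t^2, so the plate-crossing time is the root t = (-vy0 - sqrt(vy0^2 - 2*ay*(y0 - crossPlateY))) / ay, and plateX/plateZ are x(t) and z(t) at that time. pfx_x/pfx_z compare the real path against a spin-free path extrapolated from y = 40 ft; the factor 12 converts feet to inches and 32.174 ft/s^2 is gravity. A standalone sketch with invented sample numbers, purely for illustration:

# Standalone version of the Example #14 plate-crossing math.
# Inputs are made-up PITCHf/x-style values in feet and feet/second.
import math

def plate_crossing(y0, vy0, ay, x0, vx0, ax, z0, vz0, az, y_plate):
    """Solve y(t) = y_plate for t, then return (t, x, z) at that time."""
    t = (-vy0 - math.sqrt(vy0 * vy0 - 2 * ay * (y0 - y_plate))) / ay
    x = x0 + vx0 * t + 0.5 * ax * t * t
    z = z0 + vz0 * t + 0.5 * az * t * t
    return t, x, z

t, px, pz = plate_crossing(y0=50.0, vy0=-130.0, ay=25.0,
                           x0=-1.0, vx0=5.0, ax=-8.0,
                           z0=6.0, vz0=-3.0, az=-20.0, y_plate=1.417)
print(round(t, 4), round(px, 3), round(pz, 3))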
Example #15
def download_relay(args, lm=None):
    # return True or False
    relay_url = 'http://m.sports.naver.com/ajax/baseball/gamecenter/kbo/relayText.nhn'
    record_url = 'http://m.sports.naver.com/ajax/baseball/gamecenter/kbo/record.nhn'

    game_ids = get_game_ids(args)
    if (game_ids is None) or (len(game_ids) == 0):
        print('no game ids')
        print('args: {}'.format(args))
        if lm is not None:
            lm.log('no game ids')
            lm.log('args: {}'.format(args))
        return False

    if lm is not None:
        lm.resetLogHandler()
        lm.setLogPath(os.getcwd())
        lm.setLogFileName('relay_download_log.txt')
        lm.cleanLog()
        lm.createLogHandler()
        lm.log('---- Relay Text Download Log ----')

    if not os.path.isdir('pbp_data'):
        os.mkdir('pbp_data')
    os.chdir('pbp_data')
    # path: pbp_data

    print("##################################################")
    print("######        DOWNLOAD RELAY DATA          #######")
    print("##################################################")

    for year in game_ids.keys():
        start1 = time.time()
        print(" Year {}".format(year))
        if len(game_ids[year]) == 0:
            print('month id is empty')
            print('args: {}'.format(args))
            if lm is not None:
                lm.log('month id is empty')
                lm.log('args : {}'.format(args))
            return False

        if not os.path.isdir(str(year)):
            os.mkdir(str(year))
        os.chdir(str(year))
        # path: pbp_data/year

        for month in game_ids[year].keys():
            start2 = time.time()
            print("  Month {}".format(month))
            if len(game_ids[year][month]) == 0:
                print('month id is empty')
                print('args: {}'.format(args))
                if lm is not None:
                    lm.log('month id is empty')
                    lm.log('args : {}'.format(args))
                return False

            if not os.path.isdir(str(month)):
                os.mkdir(str(month))
            os.chdir(str(month))
            # path: pbp_data/year/month

            # download
            done = 0
            skipped = 0
            for game_id in game_ids[year][month]:
                if (int(game_id[:4]) < 2008) or (int(game_id[:4]) > 7777):
                    skipped += 1
                    continue
                if (int(game_id[:4]) == datetime.datetime.now().year) and (int(
                        game_id[4:8]) > int(
                            datetime.datetime.now().date().strftime('%m%d'))):
                    skipped += 1
                    continue
                if int(game_id[4:8]) < int(regular_start[game_id[:4]]):
                    skipped += 1
                    continue
                if int(game_id[4:8]) >= int(playoff_start[game_id[:4]]):
                    skipped += 1
                    continue
                if game_id[8:10] not in teams:
                    skipped += 1
                    continue

                if not check_url(relay_url):
                    skipped += 1
                    if lm is not None:
                        lm.log('URL error : {}'.format(relay_url))
                    continue

                if (int(game_id[:4]) == datetime.datetime.now().year) and \
                   (int(game_id[4:6]) == datetime.datetime.now().month) and \
                   (int(game_id[6:8]) == datetime.datetime.now().day):
                    # today's game: fall through and (re)download
                    pass
                elif (os.path.isfile(game_id + '_relay.json')) and \
                        (os.path.getsize(game_id + '_relay.json') > 0):
                    done += 1
                    if lm is not None:
                        lm.log('File Duplicate : {}'.format(game_id))
                    continue

                params = {'gameId': game_id, 'half': '1'}

                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                    'Chrome/59.0.3071.115 Safari/537.36',
                    'X-Requested-With':
                    'XMLHttpRequest',
                    'Host':
                    'm.sports.naver.com',
                    'Referer':
                    'http://m.sports.naver.com/baseball/gamecenter/kbo/index.nhn?&gameId='
                    + game_id + '&tab=relay'
                }

                response = requests.get(relay_url,
                                        params=params,
                                        headers=headers)

                if response is not None:
                    txt = {}
                    js = response.json()
                    if isinstance(js, str):
                        js = json.loads(js)
                    last_inning = js['currentInning']

                    if last_inning is None:
                        skipped += 1
                        if lm is not None:
                            lm.log('Gameday not found : {}'.format(game_id))
                        continue

                    txt['relayList'] = {}
                    for i in range(len(js['relayList'])):
                        txt['relayList'][js['relayList'][i]
                                         ['no']] = js['relayList'][i]
                    txt['homeTeamLineUp'] = js['homeTeamLineUp']
                    txt['awayTeamLineUp'] = js['awayTeamLineUp']

                    txt['stadium'] = js['schedule']['stadium']

                    response.close()

                    for inn in range(2, last_inning + 1):
                        params = {'gameId': game_id, 'half': str(inn)}

                        response = requests.get(relay_url,
                                                params=params,
                                                headers=headers)
                        if response is not None:
                            js = response.json()
                            if isinstance(js, str):
                                js = json.loads(js)
                                #js = ast.literal_eval(js)

                            # BUGBUG
                            # Some relay texts contain characters that cannot
                            # be encoded as cp949, e.g. \ufffd (REPLACEMENT
                            # CHARACTER). gameid: 20180717LGWO02018
                            # Workaround: blank out any text cp949 cannot encode.
                            for i in range(len(js['relayList'])):
                                txt['relayList'][js['relayList'][i]
                                                 ['no']] = js['relayList'][i]
                                texts = txt['relayList'][
                                    js['relayList'][i]['no']]['textOptionList']
                                for j in range(len(texts)):
                                    try:
                                        texts[j]['text'].encode('cp949')
                                    except UnicodeEncodeError:
                                        texts[j]['text'] = ''
                        else:
                            skipped += 1
                            if lm is not None:
                                lm.log(
                                    'Cannot get response : {}'.format(game_id))

                        response.close()

                    # get referee
                    params = {'gameId': game_id}

                    headers = {
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                        'like Gecko) Chrome/59.0.3071.115 Safari/537.36',
                        'X-Requested-With':
                        'XMLHttpRequest',
                        'Host':
                        'm.sports.naver.com',
                        'Referer':
                        'http://m.sports.naver.com/baseball/gamecenter/kbo/index.nhn?gameId='
                        + game_id + '&tab=record'
                    }

                    response = requests.get(record_url,
                                            params=params,
                                            headers=headers)

                    p = regex.compile(
                        '(?<=\"etcRecords\":\[)[\\\.\{\}\"0-9:\s\(\)\,\ba-z가-힣\{\}]+'
                    )
                    result = p.findall(response.text)
                    if len(result) == 0:
                        txt['referee'] = ''
                    else:
                        txt['referee'] = result[0].split('{')[-1].split(
                            '":"')[1].split(' ')[0]
                    '''
                    p = regex.compile('stadiumName: \'\w+\'')
                    result = p.findall(response.text)
                    if len(result) == 0:
                        txt['stadium'] = ''
                    else:
                        txt['stadium'] = result[0].split('\'')[1]
                    '''

                    response.close()

                    fp = open(game_id + '_relay.json', 'w', newline='\n')
                    json.dump(txt,
                              fp,
                              ensure_ascii=False,
                              sort_keys=False,
                              indent=4)
                    fp.close()

                    ##### Save the text only
                    text_list = []
                    pts_list = []
                    text_list_header = [
                        "textOrder", "textType", "text", "ptsPitchId", "stuff",
                        "speed"
                    ]
                    pts_list_header = [
                        "textOrder", "inn", "ballcount", "crossPlateX",
                        "topSz", "crossPlateY", "pitchId", "vy0", "vz0", "vx0",
                        "z0", "y0", "ax", "x0", "ay", "az", "bottomSz",
                        "stance"
                    ]
                    for k in sorted(txt['relayList'].keys()):
                        textset = txt['relayList'][k]
                        textOptionList = textset['textOptionList']
                        for to in textOptionList:
                            row = [k, to['type'], to['text']]
                            if 'ptsPitchId' in to.keys():
                                row.append(to['ptsPitchId'])
                            else:
                                row.append('')
                            if 'stuff' in to.keys():
                                row.append(to['stuff'])
                            else:
                                row.append('')
                            if 'speed' in to.keys():
                                row.append(to['speed'])
                            else:
                                row.append('')
                            text_list.append(row)
                        if 'ptsOptionList' in textset.keys():
                            ptsOptionList = textset['ptsOptionList']
                            for po in ptsOptionList:
                                row = [k] + list(po.values())
                                pts_list.append(row)

                    fp = open(game_id + '_textset.csv', 'w', newline='\n')
                    cf = csv.writer(fp)
                    cf.writerow(text_list_header)
                    for tl in text_list:
                        cf.writerow(tl)
                    fp.close()

                    fp = open(game_id + '_ptsset.csv', 'w', newline='\n')
                    cf = csv.writer(fp)
                    cf.writerow(pts_list_header)
                    for pl in pts_list:
                        cf.writerow(pl)
                    fp.close()
                    #####

                    done += 1
                else:
                    skipped += 1
                    if lm is not None:
                        lm.log('Cannot get response : {}'.format(game_id))

                print_progress('    Downloading: ', len(game_ids[year][month]),
                               done, skipped)

            # download done
            print_progress('    Downloading: ', len(game_ids[year][month]),
                           done, skipped)
            print('\n        Downloaded {} files'.format(done))
            print('        (Skipped {} files)'.format(skipped))
            end2 = time.time()
            print('            -- elapsed {:.3f} sec for month {}'.format(
                end2 - start2, month))

            os.chdir('..')
            # path: pbp_data/year
        end1 = time.time()
        print('   -- elapsed {:.3f} sec for year {}'.format(
            end1 - start1, year))
        # months done
        os.chdir('..')
        # path: pbp_data/
    # years done
    os.chdir('..')
    # path: root
    return True
Example #16
        def callback():
            s, p = ndb.get_multi([skey, pkey])
            sc, pc = False, False

            if not s or not p or p.key.parent() != s.key:
                return

            for k in keys:
                v = self.request.POST.get('_%s' % k, None)

                if v is None or v == getattr(s, k):
                    continue

                if v and k in sm and not utils.check_url(sm[k]['url'] + v):
                    r['errors'].append("%s URL doesn't seem to be working." %
                                       sm[k]['name'])
                    continue

                setattr(s, k, v)
                sc = True

            if set_domain:
                s.domain = domain
                sc = True

            pos = self.request.POST.get('pos')
            if pos:
                pages = []
                for p_pos in pos.split(','):
                    if not p_pos.startswith('p_'):
                        continue
                    pages.append(ndb.Key('Page', long(p_pos[2:]), parent=skey))
                s.pages = pages
                sc = True

            if sc:
                s.put_async()

            spec = p.spec()

            for i in range(spec.get('links', 0)):
                k = '_link_%i' % i
                if k in self.request.POST:
                    p.links[i] = self.request.POST[k]
                    pc = True

            for i in range(spec.get('text', 0)):
                k = '_text_%i' % i
                if k in self.request.POST:
                    p.text[i] = self.request.POST[k]
                    pc = True

            for i in range(spec.get('lines', 0)):
                k = '_line_%i' % i
                if k in self.request.POST:
                    p.lines[i] = self.request.POST[k]
                    pc = True

            for i in range(spec.get('maps', 0)):
                k = '_map_%i' % i
                if k in self.request.POST:
                    p.maps[i] = self.request.POST[k]
                    pc = True

            cm = self.request.POST.get('p_%s_name' % p.key.id())
            if cm:
                errors = models.Page.pagename_isvalid(s, cm)
                if cm.lower() != p.name_lower and errors:
                    r['errors'].append(errors)
                else:
                    p.name = cm
                    pc = True
            elif 'current_menu' in self.request.POST:
                r['errors'].append('Page names may not be blank')

            if 'gal' in self.request.POST and p.type == models.PAGE_TYPE_GALLERY:
                p.images = []
                for i in self.request.POST.get('gal').split(','):
                    if not i:
                        continue
                    imgid = long(i.partition('_')[2])
                    p.images.append(ndb.Key('ImageBlob', imgid, parent=skey))
                pc = True

            if p.type == models.PAGE_TYPE_BLOG:
                # what if we have multiple puts on the same entity here? race condition?
                for k in self.request.POST.keys():
                    value = None

                    if k.startswith('_posttitle_'):
                        name = 'title'
                    elif k.startswith('_posttext_'):
                        name = 'text'
                    elif k.startswith('_postauthor_'):
                        name = 'author'
                    elif k.startswith('_postdate_'):
                        name = 'date'
                        d = self.request.POST[k].split('-')
                        # the posted month is presumably 0-based, hence the +1
                        value = datetime.datetime(int(d[0]),
                                                  int(d[1]) + 1, int(d[2]))
                    elif k.startswith('_postdraft_'):
                        name = 'draft'
                        value = self.request.POST[k] == 'true'
                    else:
                        continue

                    bpid = long(k.rpartition('_')[2])
                    bp = models.BlogPost.get_by_id(bpid, parent=p.key)

                    if value is None:
                        value = self.request.POST[k]

                    setattr(bp, name, value)
                    bp.put_async()

            if pc:
                p.put_async()

            return [s, p]