def get_week_number_as_two_digits_string(in_date, in_start_iso_weekday=1):
    # If the week starts on Sunday (ISO weekday 7), shift the date forward one
    # day so Sunday counts as part of the following ISO week.
    if in_start_iso_weekday == 7:
        in_date = in_date + timedelta(days=1)
    iso_year, iso_week, _ = in_date.isocalendar()
    # Zero-pad the week number to two digits.
    iso_week_to_return = str(iso_week).zfill(2)
    return str(iso_year), iso_week_to_return
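# Usage sketch (not from the original source; assumes the helper above is in
# scope together with `from datetime import date, timedelta`):
# 2021-01-04 is the Monday of ISO week 1 of 2021, so with the default
# Monday-start week the helper returns ('2021', '01').
print(get_week_number_as_two_digits_string(date(2021, 1, 4)))  # ('2021', '01')
# With a Sunday-start week, Sunday 2021-01-03 is shifted forward into week 1:
print(get_week_number_as_two_digits_string(date(2021, 1, 3), in_start_iso_weekday=7))  # ('2021', '01')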
def parse_mint_date(date_str):
    # Dates within the current year omit the year (e.g. 'Jan 10'), so append
    # the current ISO year before parsing; older dates use the 'm/d/yy' format.
    current_year = date.today().isocalendar()[0]
    try:
        new_date = datetime.strptime(date_str + str(current_year), '%b %d%Y')
    except ValueError:
        new_date = datetime.strptime(date_str, '%m/%d/%y')
    return new_date.date()
def get_tomorrow(message):
    """ Get the schedule for the next day """
    _, group, week = message.text.split()
    today = datetime.today()
    # map the ISO week number onto the site's 1/2 (odd/even) week numbering
    week = (today.isocalendar()[1] + 1) % 2 + 1
    day = today.isoweekday() + 1
    if day == 8:  # tomorrow wraps around to Monday
        day = 1
    web_page = get_page(group, week)
    schedule = parse_schedule(web_page, day)
    if not schedule:
        resp = 'No classes'
    else:
        times_list, locations_list, classrooms_list, lessons_list = schedule
        resp = ''
        for time, location, classroom, lesson in zip(times_list, locations_list,
                                                     classrooms_list, lessons_list):
            resp += '<b>{}</b>, {}, {}, {}\n'.format(time, location, classroom, lesson)
    bot.send_message(message.chat.id, resp, parse_mode='HTML')
def __setitem__(self, index, items):
    '''Overload of the [] operator to change the value at a given position.'''
    self.config.set(index, items[0], items[1])
    # Stamp the config with the current ISO (year, week, weekday).
    self.config["UPDATED"] = '"%s/%s/%s"' % datetime.now().isocalendar()
    with open(self.name, 'w') as f:
        strjson = json.dumps(self.config, sort_keys=True, indent=4,
                             separators=(',', ': '))
        f.write(strjson)
def process_log_data(spark, input_data, output_data):
    print("--- Starting Process Log_Data ---")

    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(col("page") == "NextSong")

    # extract columns for users table
    # user_id, first_name, last_name, gender, level
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level', 'ts']
    users_table = users_table.orderBy(
        "ts", ascending=False).dropDuplicates(subset=["userId"]).drop('ts')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'),
                              'overwrite')

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
                       TimestampType())
    get_weekday = udf(lambda x: x.weekday())
    get_week = udf(lambda x: datetime.isocalendar(x)[1])
    get_hour = udf(lambda x: x.hour)
    get_day = udf(lambda x: x.day)
    get_year = udf(lambda x: x.year)
    get_month = udf(lambda x: x.month)

    df = df.withColumn('start_time', get_datetime(df.ts))
    df = df.withColumn('hour', get_hour(df.start_time))
    df = df.withColumn('day', get_day(df.start_time))
    df = df.withColumn('week', get_week(df.start_time))
    df = df.withColumn('month', get_month(df.start_time))
    df = df.withColumn('year', get_year(df.start_time))
    df = df.withColumn('weekday', get_weekday(df.start_time))

    time_table = df['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']
    time_table = time_table.drop_duplicates(subset=['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet("output/songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    df = df.join(song_df, (song_df.title == df.song) &
                          (song_df.artist_name == df.artist))
    df = df.withColumn('songplay_id', monotonically_increasing_id())
    # keep year and month so the write below can partition on them
    songplays_table = df['songplay_id', 'start_time', 'userId', 'level',
                         'song_id', 'artist_id', 'sessionId', 'location',
                         'userAgent', 'year', 'month']

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
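# Side note (not from the original project): the per-column Python UDFs above
# can be replaced by Spark's built-in column functions, which skip Python
# serialization and yield proper integer columns. A minimal sketch, assuming
# the same `df` with a millisecond epoch column `ts`:
from pyspark.sql import functions as F

def add_time_columns(df):
    # casting seconds-since-epoch to 'timestamp' parallels the get_datetime udf
    df = df.withColumn('start_time', (F.col('ts') / 1000).cast('timestamp'))
    return (df.withColumn('hour', F.hour('start_time'))
              .withColumn('day', F.dayofmonth('start_time'))
              .withColumn('week', F.weekofyear('start_time'))  # ISO week, like isocalendar()[1]
              .withColumn('month', F.month('start_time'))
              .withColumn('year', F.year('start_time'))
              # note: dayofweek() is 1 (Sunday) .. 7 (Saturday), unlike
              # Python's weekday(), which is 0 (Monday) .. 6 (Sunday)
              .withColumn('weekday', F.dayofweek('start_time')))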
def json_date_to_datetime(dateraw):
    # Dates without a year (e.g. 'Jan 10') are assumed to fall in the current
    # ISO year; otherwise fall back to the 'm/d/yy' format.
    cy = date.today().isocalendar()[0]
    try:
        newdate = datetime.strptime(dateraw + str(cy), '%b %d%Y')
    except ValueError:
        newdate = datetime.strptime(dateraw, '%m/%d/%y')
    return newdate
def addMinutes(minutes, day=None, week=None, year=None):
    currentyear, currentweek, currentday = datetime.now().isocalendar()
    if day is None:
        day = currentday
    if week is None:
        week = currentweek
    if year is None:
        year = currentyear
    data = read()
    data['work_hours'][year][week][day - 1] += minutes / 60
    write(data)
def _dateconvert(self, dateraw):
    # Converts dates from json data
    cy = date.today().isocalendar()[0]
    try:
        newdate = datetime.strptime(dateraw + str(cy), '%b %d%Y')
    except ValueError:  # narrow the bare except to the parse failure we expect
        newdate = datetime.strptime(dateraw, '%m/%d/%y')
    return newdate
def menu_scrape():
    # zerocator's data-date attribute uses the ISO-calendar 'year-week-weekday'
    # format, so build that string explicitly (str() of the isocalendar result
    # is not stable across Python versions)
    d = date.today().isocalendar()
    today_match = '{}-{}-{}'.format(*d)
    f = soup.find_all("div", {"data-date": today_match})
    for x in f:
        # parse the current div, not the whole result set
        today_soup = BeautifulSoup(str(x), 'html.parser')
        menu_info(today_soup)
def menu_scrape():
    # here data-date is a plain ISO date string (YYYY-MM-DD)
    today_match = date.today().isoformat()
    f = soup.find_all("div", {"data-date": today_match})
    for meal_num, x in enumerate(f, start=1):
        menu_info(x.encode('utf-8'), meal_num)
def _get_up_week_check():
    """Returns 0 if the upper week is even, otherwise 1."""
    start_week = date(config.START_YEAR, config.START_MONTH, config.START_DAY)
    start_week_number = start_week.isocalendar()[1]
    if start_week_number % 2 == 0:
        up_week_check = 0
    else:
        up_week_check = 1
    return up_week_check
def get_current_week_type():
    """Returns the current week type."""
    today = date.today()
    up_week_check = _get_up_week_check()
    current_week_number = today.isocalendar()[1]
    if current_week_number % 2 == 0 and up_week_check == 0:
        week_type = 1
    else:
        week_type = 0
    return week_type
def get_monday_sunday(day):
    """Convert a date into its ISO calendar (year, week) and use that to pull
    out the Monday and Sunday of that week as midnight datetimes."""
    year, week, _ = day.isocalendar()
    monday_no_time = Week(year, week).monday()
    monday = datetime.combine(monday_no_time, datetime.min.time())
    sunday_no_time = Week(year, week).sunday()
    sunday = datetime.combine(sunday_no_time, datetime.min.time())
    return monday, sunday
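# Usage sketch (not from the original source), assuming `Week` comes from the
# third-party `isoweek` package, as the Week(year, week) calls above suggest:
from datetime import date

monday, sunday = get_monday_sunday(date(2021, 1, 6))  # a Wednesday in ISO week 1
print(monday)  # 2021-01-04 00:00:00
print(sunday)  # 2021-01-10 00:00:00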
def gather_sentences(self, period):
    # Used for merging list elements
    from itertools import chain

    if period not in self.meta_keys:  # check argv
        print('key error (period)')
        sys.exit(1)

    # get current time
    now = datetime.now()
    week = now.isocalendar()[1]
    now = (str(now.year), str(now.month), str(week))

    sents_of_period = []
    sources = super(Ngrams, self).walk_src_path()
    self.a_info['total_articles'] = len(sources)

    # separate path name by slash rule
    # sep_pn => ".*/website/year/month/week/day/.*"
    sep_pn = re.compile(r'.*/(\S+)/(\S+)/(\S+)/(\S+)/(\S+)/.*')

    # gather sentences
    for src in sources:
        with open(src, mode='r', encoding='UTF-8') as jf:
            sentences = json.load(jf)['content']
        if not sentences:
            continue
        pop = sep_pn.findall(src)[0]  # part of the path
        if period == 'total':
            self.a_info['total_zi'] += len(list(chain(*sentences)))
            sents_of_period += sentences
        elif period == 'y':
            if pop[1] == now[0]:
                sents_of_period += sentences
        elif period == 'm':
            if pop[2] == now[1]:
                sents_of_period += sentences
        elif period == 'w':
            if pop[3] == now[2]:
                sents_of_period += sentences
        else:
            return []  # default: return an empty list

    gc.collect()
    return sents_of_period
def test_parse_mint_date(self):
    current_year = date.today().isocalendar()[0]
    self.assertEqual(mint.parse_mint_date('Jan 10'), date(current_year, 1, 10))
    self.assertEqual(mint.parse_mint_date('Nov 30'), date(current_year, 11, 30))
    self.assertEqual(mint.parse_mint_date('Oct 08'), date(current_year, 10, 8))
    self.assertEqual(mint.parse_mint_date('10/8/10'), date(2010, 10, 8))
    self.assertEqual(mint.parse_mint_date('1/23/10'), date(2010, 1, 23))
    self.assertEqual(mint.parse_mint_date('6/1/01'), date(2001, 6, 1))
def get_tomorrow(message):
    _, group = message.text.split()
    n = datetime.today().isocalendar()
    # isoweekday + 1 would give 8 on Sunday, so wrap back around to '1day'
    day_w = str(n[2] % 7 + 1) + 'day'
    week = what_week(n[1], day_w)
    web_page = get_page(group, week)
    times_lst, locations_lst, lessons_lst, rooms_lst = get_schedule(web_page, day_w)
    resp = ''
    for time, location, lesson, room in zip(times_lst, locations_lst,
                                            lessons_lst, rooms_lst):
        resp += '<b>{}</b>, {}, {}, {}\n'.format(time, room, location, lesson)
    bot.send_message(message.chat.id, resp, parse_mode='HTML')
def add_text(started, text):
    current_week = started.isocalendar()[1]
    week_dir = f'v{current_week}'
    if week_dir not in os.listdir(DATA_ROOT):
        os.mkdir(DATA_ROOT + week_dir)
    p = f"{DATA_ROOT}{week_dir}/{started.strftime('%Y-%m-%d')}.txt"
    with open(p, 'a+') as f:
        f.write(text + '\n')
def prepare_for_timeseries_weekly(listing):
    listing.sort()
    rows = []
    tmp = {}
    year, week, day_of_week = listing[0].isocalendar()
    tmp["year"] = year
    tmp["week"] = week
    tmp["count"] = 1
    tmp["to_order"] = listing[0]
    for date in listing[1:]:
        year, week, day_of_week = date.isocalendar()
        if tmp["year"] == year and tmp["week"] == week:
            tmp["count"] += 1
        else:
            rows.append(tmp)
            tmp = {"year": year, "week": week, "count": 1, "to_order": date}
    rows.append(tmp)  # don't drop the last, still-open week group
    # build the frame in one go instead of the removed DataFrame.append
    df = pd.DataFrame(rows)
    df = df.drop(columns=["week", "year"])
    return df
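# Usage sketch (not from the original source): three dates spanning two ISO
# weeks collapse into two rows, each keeping its week's first date in
# 'to_order' and the number of dates seen in 'count'.
from datetime import date

df = prepare_for_timeseries_weekly([date(2021, 1, 4), date(2021, 1, 5),
                                    date(2021, 1, 11)])
print(df)
#    count    to_order
# 0      2  2021-01-04
# 1      1  2021-01-11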
def getWeek(self, course, date):
    # Get week number from date
    (_, week, _) = date.isocalendar()
    if week > 52:
        week -= 52

    # Make weekid for cache identification
    weekid = course + str(week)

    # Download the individual page for that course and that week
    if weekid not in self.cache:
        self.cache[weekid] = self.downloadPage(
            self.week_template_url.format(course, week))

    # Return the table that actually contains the schedule
    return self.cache[weekid].xpath('/html/body/table[2]')[0]
def __init__(self, name):
    try:
        self.name = name
        if not os.path.exists(name):
            from datetime import datetime
            with open(name, 'w') as f:
                # Seed the file with valid JSON; the timestamp is the current
                # ISO (year, week, weekday).
                f.write('{\n\t"UPDATED":"%s/%s/%s",\n\t"PLUGINS":{},\n\t"PERMISSIONS":{}\n}\n'
                        % datetime.now().isocalendar())
        with open(name) as configFile:
            content = configFile.read()
        self.config = json.loads(content)
        io.log('loaded', name.split(os.sep)[-1])
    except Exception as error:
        io.error(error)
def get_data(self):
    '''Query the db for chart data, pack it into a dict and return it.'''
    this_week = datetime.today().isocalendar()[1]
    # seasons whose expected harvest date falls in the current ISO week
    harvest = [harvest_season for harvest_season in Season.objects.all()
               if harvest_season.expected_harvest_date.isocalendar()[1] == this_week]
    # Monday..Sunday range of the current week
    start = datetime.today() - timedelta(days=datetime.today().weekday())
    end = start + timedelta(days=6)
    dateList = pd.date_range(start, end)
    dateListConvert = list(map(pd.Timestamp.to_pydatetime, dateList))
    data = {}
    for day in dateListConvert:
        data[day.date()] = 0
    for harvests in harvest:
        if harvests.expected_harvest_date in data:
            data[harvests.expected_harvest_date] = harvests.estimated_yield
    return data
def getEventsFromDate(self, course, date):
    # Get the week's schedule
    week = self.getWeek(course, date)

    # Get day of week
    (_, _, day) = date.isocalendar()

    # How many rows does the day span
    row_span = week.xpath('tr/td[1]//@rowspan')
    row_span = [int(x) for x in row_span]
    start_row = 2 + sum(row_span[:day - 1])
    row_count = row_span[day - 1]

    xml_events = []
    # Get the events from a specific day
    for row in range(0, row_count):
        # Get the events from a specific row
        xml_events += week.xpath('tr[' + str(start_row + row) + ']/td[table]')

    # Reverse so we can use pop and append from the end
    xml_events.reverse()

    # List of events
    events = []
    while xml_events:
        # Pick first event
        event = Event(xml_events.pop(), date)
        if xml_events:
            # Look at next event
            next_event = Event(xml_events[-1], date)
            # If the next event is the same course and the gap is only a short break
            if next_event.course == event.course and \
                    (next_event.starttime - event.endtime) < self.smallbreak:
                # Combine the two events
                event.endtime = next_event.endtime
                # Remove the next event from the queue
                xml_events.pop()
        # Push event to list
        events.append(event)
    return events
def __init__(self, weeknum, classcode, buildingcode, sectorcode):
    # Defaults: the current ISO week number and fixed class/building/sector codes
    self.weeknum = datetime.today().isocalendar()[1]
    self.classcode = "HEITO19AO-A"
    self.buildingcode = "HRN"
    self.sectorcode = "ECO"
    try:
        if weeknum is not None and isinstance(weeknum, str):
            self.weeknum = weeknum
        if classcode is not None and isinstance(classcode, str):
            self.classcode = classcode
        if buildingcode is not None and isinstance(buildingcode, str):
            self.buildingcode = buildingcode
        if sectorcode is not None and isinstance(sectorcode, str):
            self.sectorcode = sectorcode
    except Exception:
        raise ClientException(
            "One of the URL parameters that was entered is invalid")
def submit_form():
    """Show and process the timetracker form"""

    # ############################ Show Proper Form ############################
    # set date to today at midnight
    old_date = datetime.now()
    stripped_date = old_date.date()
    date = datetime.combine(stripped_date, datetime.min.time())

    # extract the ISO weekday from the datetime stamp
    iso_week = date.isocalendar()
    day = iso_week[2]

    # set user_id to the logged-in user
    user_id = session["user_id"]

    # see if there is already a response with the same day and time id in the db
    test_response = (db.session.query(Response.time_interval)
                     .filter(Response.date == date,
                             Response.user_id == session["user_id"]).all())

    # create a list of the time intervals from the query above
    used_times = [item[0] for item in test_response]

    # only display times that haven't already been filled out
    if request.method == 'GET':
        return render_template("form.html", times=TIMES, used_times=used_times)

    # ############################ Process Form ############################
    else:
        # get form variables
        hourint = request.form["hourint"]
        text = request.form["text"]
        color = request.form["color"]

        # create a new response
        new_response = Response(user_id=user_id, color=color, date=date,
                                day=day, time_interval=hourint, text=text)

        # add the new response to the database
        db.session.add(new_response)
        db.session.commit()

        return redirect("/chart?times=" + ",".join(str(x) for x in TIMES))
def main():
    currentTime = datetime.now()
    year, week, day = currentTime.isocalendar()
    with open(fp, 'rb+') as f:
        data = pickle.load(f)
        # first work session of the day: record the day start and trim history
        if data['work_hours'].setdefault(year, {}).setdefault(
                week, [0, 0, 0, 0, 0, 0, 0])[day - 1] == 0 \
                and data['tracking_start'] is None:
            data['day_start'] = currentTime
            data['timestamps'] = data['timestamps'][-20:]
        if data['tracking_start'] is None:
            data['tracking_start'] = currentTime
            print('Started Tracking..')
        else:
            currentWork = (currentTime - data['tracking_start']).total_seconds() / 3600
            data['work_hours'][year][week][day - 1] += currentWork
            hoursPassed = (currentTime - data['day_start']).total_seconds() / 3600 \
                + PREDICTION_BUFFER
            dayLength = DAY_END - data['day_start'].hour \
                - data['day_start'].minute / 60 + PREDICTION_BUFFER
            dayPrediction = dayLength * data['work_hours'][year][week][day - 1] / hoursPassed
            if day == 1:
                weekAvg = sum(data['work_hours'][year][week - 1]) / 7
            else:
                weekAvg = sum(data['work_hours'][year][week][:day - 1]) / (day - 1)
            weekprediction = (sum(data['work_hours'][year][week][:day - 1])
                              + dayPrediction) / day
            data['timestamps'].append((data['tracking_start'], currentTime))
            data['tracking_start'] = None
            print(day, f'{currentTime.hour}:{currentTime.minute} \n')
            print(f'{currentWork:.2f} {data["work_hours"][year][week][day-1]:.2f} '
                  f'{dayPrediction:.2f}\n')
            print(f'{weekAvg:.2f} {weekprediction:.2f}')
        f.seek(0)
        pickle.dump(data, f)
def get_near_lesson(message):
    _, group = message.text.split()
    now = datetime.today().isocalendar()
    time = str(datetime.now().time())[0:5]
    hour_now = int(time[:2])
    minute_now = int(time[3:])
    day_w = str(now[2]) + 'day'
    week = what_week(now[1], day_w)
    web_page = get_page(group, week)
    resp = ''
    if day_w == '7day':
        # Sunday: the nearest lesson is Monday's first one
        times_lst, locations_lst, lessons_lst, rooms_lst = get_schedule(web_page, '1day')
        resp = '<b>{}</b>, {}, {}, {}\n'.format(times_lst[0], rooms_lst[0],
                                                locations_lst[0], lessons_lst[0])
    times_lst, locations_lst, lessons_lst, rooms_lst = get_schedule(web_page, day_w)
    # parse the start hour/minute of every lesson
    times_hour = []
    times_minute = []
    for t in times_lst:
        t = t[:5]
        if t[-1] == '-':
            t = t[:-1]
        h, m = t.split(':')
        times_hour.append(int(h))
        times_minute.append(int(m))
    # find the first lesson that has not started yet
    for k in range(len(times_hour)):
        if hour_now < times_hour[k] or (hour_now == times_hour[k]
                                        and minute_now <= times_minute[k]):
            resp = '<b>{}</b>, {}, {}, {}\n'.format(times_lst[k], rooms_lst[k],
                                                    locations_lst[k], lessons_lst[k])
            break
    if times_hour and hour_now > times_hour[-1]:
        # all of today's lessons are over: show the next day's first lesson
        if day_w == '6day':
            next_day = '1day'
        else:
            next_day = '{}day'.format(int(day_w[0]) + 1)
        times_lst, locations_lst, lessons_lst, rooms_lst = get_schedule(web_page, next_day)
        resp = '<b>{}</b>, {}, {}, {}\n'.format(times_lst[0], rooms_lst[0],
                                                locations_lst[0], lessons_lst[0])
    bot.send_message(message.chat.id, resp, parse_mode='HTML')
def process_log_data(spark, input_data, output_data):
    log_data = os.path.join(input_data, "log_data/*/*/*.json")
    df = spark.read.json(log_data)
    df = df.where(col("page") == "NextSong")

    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level', 'ts']
    users_table = users_table.orderBy(
        "ts", ascending=False).dropDuplicates(subset=["userId"]).drop('ts')
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite')

    get_datetime = udf(lambda x: datetime.fromtimestamp(int(int(x) / 1000)), TimestampType())
    get_weekday = udf(lambda x: x.weekday())
    get_week = udf(lambda x: datetime.isocalendar(x)[1])
    get_hour = udf(lambda x: x.hour)
    get_day = udf(lambda x: x.day)
    get_year = udf(lambda x: x.year)
    get_month = udf(lambda x: x.month)

    df = df.withColumn('start_time', get_datetime(df.ts))
    df = df.withColumn('hour', get_hour(df.start_time))
    df = df.withColumn('day', get_day(df.start_time))
    df = df.withColumn('week', get_week(df.start_time))
    df = df.withColumn('month', get_month(df.start_time))
    df = df.withColumn('year', get_year(df.start_time))
    df = df.withColumn('weekday', get_weekday(df.start_time))

    time_table = df['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']
    time_table = time_table.drop_duplicates(subset=['start_time'])
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time.parquet'), 'overwrite')

    song_df = spark.read.parquet("results/songs.parquet")
    df = df.join(song_df, (song_df.title == df.song) & (song_df.artist_name == df.artist))
    df = df.withColumn('songplay_id', monotonically_increasing_id())
    songplays_table = df['songplay_id', 'start_time', 'userId', 'level', 'song_id',
                         'artist_id', 'sessionId', 'location', 'userAgent']
    songplays_table.write.parquet(os.path.join(output_data, 'songplays.parquet'), 'overwrite')
def test_dictionary_creation(self):
    """given query response, is a json dictionary created?"""
    # create fake response
    hourint = 1
    text = "This is some text"
    color = "green"
    date_obj = datetime.strptime("2015, 2, 16", "%Y, %m, %d")
    date = date_obj.date()
    iso_week = date.isocalendar()
    day = iso_week[2]
    user_id = 5000
    new_response = Response(user_id=user_id, color=color, date=date,
                            day=day, time_interval=hourint, text=text)

    # add and commit it to my database
    db.session.add(new_response)
    db.session.commit()

    # query for that data
    query = db.session.query(Response).filter_by(user_id=5000).first()
    print(query)

    # write out what that query jsonified would look like
    json_dict = {
        "response_id": 1,
        "day": 2,
        "words": "This is some text",
        "hour": 1,
        "value": "green"
    }

    # pass that query data to to_d3_dict for comparison
    self.assertEqual(query.to_d3_dict()["words"], json_dict["words"])

    # roll the database back to get rid of the fake response
    db.session.rollback()
# python -u $CINGROOT/python/cing/Scripts/interactive/mouseBuffer3.py
from datetime import datetime
from numpy import *  #@UnusedWildImport

dt = datetime.now()
print(dt.isocalendar())

# Below is a memory test which shows that Python doesn't like to allocate over
# 2 GB in any one chunk but can go higher with multiple chunks.
# A floating point in Python is implemented as a C double;
# on a 32-bit executable this is 64 bits per double, i.e. 8 bytes.
a = ones((1024, 1024, 10)) * 1.1
aSizeInMb = a.size * 8 / (1024 * 1024)
print(aSizeInMb)
v = ones((1024, 1024, 10)) * 1.1
del(a)  # instant release.
def week_number(datetime):
    return datetime.isocalendar()[1]
def macro_RecentChangeSummury(macro, year=0, week_number=0, comments_only=False):
    if year == 0 and week_number == 0:
        year, week_number, wd = datetime.today().isocalendar()
    return getPageListFromLog(macro, year, week_number, comments_only)
def getPageListFromLog(macro, req_year, req_week_number, comments_only):
    request = macro.request
    pages = {}
    passed = False
    for line in editlog.EditLog(request).reverse():
        if not request.user.may.read(line.pagename):
            continue
        line.time_tuple = request.user.getTime(
            wikiutil.version2timestamp(line.ed_time_usecs))
        year, wn, wd = datetime.fromtimestamp(
            time.mktime(line.time_tuple)).isocalendar()
        yw = '%04d%02d' % (year, wn)
        if req_year > 0 and req_week_number > 0:
            if req_week_number == wn and req_year == year:
                passed = True
            elif passed and ((req_week_number < wn and req_year == year)
                             or req_year < year):
                break  # for performance
            else:
                continue
        if yw not in pages:
            pages[yw] = {}
        if line.pagename in pages[yw]:
            pages[yw][line.pagename].append(line.comment)
        else:
            pages[yw][line.pagename] = [line.comment]

    ret = []
    for yw in reversed(sorted(pages.keys())):
        if len(pages[yw]) > 0:
            ret.append("WEEK%s, %s" % (yw[-2:], yw[:4]))
        for page in reversed(sorted(pages[yw].keys(),
                                    key=lambda x: len(pages[yw][x]))):
            edit_cnt = len(pages[yw][page])
            comments = [c for c in pages[yw][page] if len(c) > 0]
            p = Page(request, page)
            if len(comments) > 0 or not comments_only:
                if p.exists():
                    ret.append(' * [[%s]] (%s)' % (page, str(edit_cnt)))
                else:
                    ret.append(' * `%s` (%s)' % (page, str(edit_cnt)))
                for comment in comments:
                    ret.append('  * ' + comment)

    macro_str = "<<%s(%s)>>" % (macro.name, macro.args)
    content_str = '\n'.join(ret)
    form = u'''<form method='post'>
<input type='hidden' name='action' value='ReplaceTagAction'>
<input type='hidden' name='rsv' value='0'>
<input type='hidden' name='regexp' value='0'>
<textarea name='tag' style='display:none'>%s</textarea>
<textarea name='txt' style='display:none'>%s</textarea>
<input type='submit' value=' HARDCOPY TO THIS PAGE '>
</form>
''' % (macro_str, content_str)
    return wikiutil.renderText(request, WikiParser,
                               wikiutil.escape(content_str)) + form
# extract columns for users table
users_table = df['userId', 'firstName', 'lastName', 'gender', 'level', 'ts']
users_table = users_table.orderBy(
    "ts", ascending=False).dropDuplicates(subset=["userId"]).drop('ts')

# write users table to parquet files
users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite')

# create timestamp column from original timestamp column
# (note: this column is overwritten below by get_datetime, which uses local
# time via fromtimestamp rather than UTC)
get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType())
df = df.withColumn("start_time", get_timestamp("ts"))

# extract columns to create time table
get_datetime = udf(lambda x: datetime.fromtimestamp(int(int(x) / 1000)), TimestampType())
get_weekday = udf(lambda x: x.weekday())
get_week = udf(lambda x: datetime.isocalendar(x)[1])
get_hour = udf(lambda x: x.hour)
get_day = udf(lambda x: x.day)
get_year = udf(lambda x: x.year)
get_month = udf(lambda x: x.month)

df = df.withColumn('start_time', get_datetime(df.ts))
df = df.withColumn('hour', get_hour(df.start_time))
df = df.withColumn('day', get_day(df.start_time))
df = df.withColumn('week', get_week(df.start_time))
df = df.withColumn('month', get_month(df.start_time))
df = df.withColumn('year', get_year(df.start_time))
df = df.withColumn('weekday', get_weekday(df.start_time))

time_table = df['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']
time_table = time_table.drop_duplicates(subset=['start_time'])
def process_log_data(spark, input_data, output_data):
    """
    Processes log_data from S3 into a local directory.

    Creates the dimension tables "users" and "time" and the fact table
    "songplays".

    Params:
        spark: SparkSession
        input_data: Root-URL to S3 bucket
        output_data: Path to local directory

    Returns:
        None
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level', 'ts']
    users_table = users_table.orderBy(
        "ts", ascending=False).dropDuplicates(subset=["userId"]).drop('ts')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'),
                              'overwrite')

    get_datetime = udf(lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
                       TimestampType())
    get_weekday = udf(lambda x: x.weekday())
    get_week = udf(lambda x: datetime.isocalendar(x)[1])
    get_hour = udf(lambda x: x.hour)
    get_day = udf(lambda x: x.day)
    get_year = udf(lambda x: x.year)
    get_month = udf(lambda x: x.month)

    df = df.withColumn('start_time', get_datetime(df.ts))
    df = df.withColumn('hour', get_hour(df.start_time))
    df = df.withColumn('day', get_day(df.start_time))
    df = df.withColumn('week', get_week(df.start_time))
    df = df.withColumn('month', get_month(df.start_time))
    df = df.withColumn('year', get_year(df.start_time))
    df = df.withColumn('weekday', get_weekday(df.start_time))

    time_table = df['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']
    time_table = time_table.drop_duplicates(subset=['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs.parquet'))

    # extract columns from joined song and log datasets to create songplays table
    df = df.join(song_df, (song_df.title == df.song) &
                          (song_df.artist_name == df.artist))
    df = df.withColumn('songplay_id', monotonically_increasing_id())
    songplays_table = df['songplay_id', 'start_time', 'userId', 'level',
                         'song_id', 'artist_id', 'sessionId', 'location',
                         'userAgent']

    # write songplays table to parquet files (not partitioned: the selection
    # above carries no year/month columns)
    songplays_table.write.parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
def get_week(datetime):
    year, week, weekday = datetime.isocalendar()
    return (year, week)
def process_log_data(spark, input_data, output_data):
    """Processes the log data

    Arguments:
        spark -- The spark session used for computation
        input_data {str} -- The URI where the data needs to be loaded from
        output_data {str} -- The URI where the transformed data is saved
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level', 'ts'].distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", "overwrite")

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(int(int(x) / 1000)), TimestampType())
    get_weekday = udf(lambda x: x.weekday())
    get_week = udf(lambda x: datetime.isocalendar(x)[1])
    get_hour = udf(lambda x: x.hour)
    get_day = udf(lambda x: x.day)
    get_year = udf(lambda x: x.year)
    get_month = udf(lambda x: x.month)

    df = df.withColumn('start_time', get_datetime(df.ts))
    df = df.withColumn('hour', get_hour(df.start_time))
    df = df.withColumn('day', get_day(df.start_time))
    df = df.withColumn('week', get_week(df.start_time))
    df = df.withColumn('month', get_month(df.start_time))
    df = df.withColumn('year', get_year(df.start_time))
    df = df.withColumn('weekday', get_weekday(df.start_time))

    time_table = df['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']

    # write time table to parquet files
    time_table.write.parquet(output_data + "time.parquet", "overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    df = df.join(song_df, (song_df.title == df.song) &
                          (song_df.artist_name == df.artist))
    df = df.withColumn('songplay_id', monotonically_increasing_id())
    songplays_table = df['songplay_id', 'start_time', 'userId', 'level', 'song_id',
                         'artist_id', 'sessionId', 'location', 'userAgent'].distinct()

    # write songplays table to parquet files
    songplays_table.write.parquet(output_data + "songplays.parquet", "overwrite")
def process_log_data(spark, input_data, output_data):
    """Process log json data

    Pulls the raw json log data from s3 and saves 3 tables in parquet format
    to local. Creates the fact table songplays and the dimension tables users
    and time. The songplays fact table takes the songs.parquet file as input.

    Args:
        spark: SparkSession Object
        input_data: s3 bucket url
        output_data: destination s3 bucket url

    Returns:
        None
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays (the page value is case-sensitive:
    # "NextSong", not "Nextsong")
    df = df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = df["userId", "firstName", "lastName", "gender", "level", "ts"]

    # sort table on timestamp then drop timestamp
    users_table = users_table.orderBy("ts", ascending=False).drop("ts")

    # drop duplicates
    users_table = users_table.drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, "users.parquet"), "overwrite")
    print("users.parquet file created and saved locally")

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(int(int(x) / 1000)), TimestampType())
    get_weekday = udf(lambda x: x.weekday())
    get_week = udf(lambda x: datetime.isocalendar(x)[1])
    get_hour = udf(lambda x: x.hour)
    get_day = udf(lambda x: x.day)
    get_year = udf(lambda x: x.year)
    get_month = udf(lambda x: x.month)

    df = df.withColumn("start_time", get_datetime(df.ts))

    # derive further columns from the new start_time column
    df = df.withColumn("weekday", get_weekday(df.start_time))
    df = df.withColumn("week", get_week(df.start_time))
    df = df.withColumn("hour", get_hour(df.start_time))
    df = df.withColumn("day", get_day(df.start_time))
    df = df.withColumn("year", get_year(df.start_time))
    df = df.withColumn("month", get_month(df.start_time))

    # extract columns to create time table
    time_table = df["start_time", "weekday", "hour", "day", "week", "month", "year"]

    # remove duplicates
    time_table = time_table.drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "time.parquet"), "overwrite")
    print("time.parquet file written")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, "songs.parquet"))

    # extract columns from joined song and log datasets to create songplays table
    df = df.join(song_df, (song_df.title == df.song) &
                          (song_df.artist_name == df.artist))
    df = df.withColumn("songplay_id", monotonically_increasing_id())
    songplays_table = df["songplay_id", "start_time", "userId", "level",
                         "song_id", "artist_id", "sessionId", "location",
                         "userAgent", "year", "month"]

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "songplays.parquet"), "overwrite")
    print("songplays.parquet file written, job done!")
def week_from_score(score) -> int:
    datetime = parsedate(score.findtext("DateTime"))
    week = datetime.isocalendar()[1]
    return week
def fmtweek(datetime):
    (year, week) = datetime.isocalendar()[:2]
    return '%04dW%02d' % (year, week)
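# Usage sketch (not from the original source): fmtweek renders the ISO
# year/week pair in compact ISO-8601 week notation.
from datetime import date

print(fmtweek(date(2021, 1, 4)))    # 2021W01
print(fmtweek(date(2020, 12, 31)))  # 2020W53 (2020 had 53 ISO weeks)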
def generate_data_by_date(apple_data_type, dataset_name, data_type):
    date_dict = dict()
    for child in root:
        attr = child.attrib
        # find the matching data type
        if child.tag == 'Record' and attr['type'] == apple_data_type:
            start_date = datetime.strptime(attr['startDate'], '%Y-%m-%d %H:%M:%S %z')
            end_date = datetime.strptime(attr['endDate'], '%Y-%m-%d %H:%M:%S %z')
            # check year
            if start_date.year == YEAR:
                # step count & date
                count = int(attr['value'])
                date = datetime.strftime(start_date, '%-m/%-d/%Y')
                # check start and end date in case the count spans two or more days
                if start_date.isocalendar() != end_date.isocalendar():
                    # split the count in proportion to the duration before and
                    # after midnight
                    midnight = end_date.date().strftime('%Y-%m-%d %H:%M:%S')
                    midnight_time = datetime.strptime(midnight, '%Y-%m-%d %H:%M:%S')
                    till_midnight = (midnight_time - start_date.replace(tzinfo=None)).seconds
                    from_midnight = (end_date.replace(tzinfo=None) - midnight_time).seconds
                    in_the_middle = 0
                    mid_date_count = (end_date - start_date).days - 1
                    # more than one day of gap: seconds of the whole middle days
                    if mid_date_count > 0:
                        in_the_middle = 60 * 60 * 24 * mid_date_count
                    total_seconds = till_midnight + in_the_middle + from_midnight
                    count_before_midnight = round(till_midnight / total_seconds * count)
                    count_after_midnight = round(from_midnight / total_seconds * count)
                    # add the split counts to the start and end dates
                    date_dict[date] = date_dict.get(date, 0) + count_before_midnight
                    if end_date.year == YEAR:
                        next_date = datetime.strftime(end_date, '%-m/%-d/%Y')
                        date_dict[next_date] = date_dict.get(next_date, 0) + count_after_midnight
                    # distribute the remaining count evenly over the days in the middle
                    for i in range(mid_date_count):
                        count_in_a_mid_day = round(
                            (count - count_before_midnight - count_after_midnight)
                            / mid_date_count)
                        mid_datetime = start_date + timedelta(days=(i + 1))
                        mid_date = datetime.strftime(mid_datetime, '%-m/%-d/%Y')
                        if mid_datetime.year == YEAR:
                            date_dict[mid_date] = date_dict.get(mid_date, 0) + count_in_a_mid_day
                else:
                    date_dict[date] = date_dict.get(date, 0) + count

    # convert dict to array
    data_of_year = []
    for d in date_dict:
        data_of_year.append(dict(date=d, value=date_dict[d]))

    # sort by date; the dates are often unordered in the original data
    data_of_year = sorted(
        data_of_year,
        key=lambda i: datetime.strptime(i['date'], '%m/%d/%Y').timestamp())

    # save data as json
    _savedatasets.save_dataset(data_of_year, _setup.NAME, dataset_name, data_type)