def parseCourse(self, course, dow):
    """Turn one HTML schedule fragment into a JSON-formatted course entry.

    The markup is reduced to plain text with two regex substitutions and
    then split on newline/tab/carriage-return runs; the pieces at fixed
    positions 2, 4, 6 and 8 are read as time range, course title,
    instructor list and start date respectively.

    :param course: raw HTML text of a single course.
    :param dow: day-of-week value; passed on as the single element of the
        ``days`` list.
    :returns: whatever ``self.toJSONFormatCourse`` returns.
    :raises IndexError: if the split text has fewer than nine pieces.
    """
    # Replace the "</a ... br ...>" run that sits between entries with ", ".
    text = re.sub('</a.*\n.*br.*">', ', ', course)
    # Drop the remaining tags.
    text = re.sub('<t.*">|<.*.*>', '', text)
    # The pattern has a capturing group, so re.split also returns the
    # captured separators; the indices below rely on that layout.
    pieces = re.split('(\n|\t|\r)*', text)

    title = pieces[4]
    start_text = pieces[8]

    times = ParserUtil.parseTime(pieces[2], ":")
    startDate = ParserUtil.parseStartDate(start_text, "/")
    originalLevel = self.parseOriginalLevel(title)
    level = self.parseLevel(title)
    genre = ParserUtil.parseGenre(title)
    # Fall back to "partnerwork" when no type is recognised in the title.
    course_type = ParserUtil.parseType(title) or "partnerwork"
    instructors = re.split(',', pieces[6])

    # Arguments as passed here: name, description, school, level,
    # originalLevel, days, genre, type, classCount, instructors, startDate,
    # startTime, endTime, duration
    return self.toJSONFormatCourse(
        title,
        start_text,
        self.SCHOOL_NAME,
        level,
        originalLevel,
        [dow],
        genre,
        course_type,
        self.DEFAULT_COURSE_DAYS_NUM,
        instructors,
        startDate,
        times[0],
        times[1],
        ParserUtil.getDuration(times[1], times[0]))
def parseCourse(self, course, week):
    """Parse one "<br />"-separated schedule entry into a JSON course entry.

    The entry is split on "<br />". Piece 0 is the time line (its first
    character is a day digit, the text from offset 3 on is the time range),
    piece 1 is the course name, piece 2 (or piece 3 when piece 2 yields an
    empty start date) carries the start date.

    :param course: raw text of a single schedule entry.
    :param week: when falsy, 5 is added to the computed day index
        (presumably the weekend table -- confirm against the caller).
    :returns: the value of ``self.toJSONFormatCourse`` for the entry, or
        ``None`` when the entry is not a course line (no leading day digit,
        fewer than three pieces) or is a Zouk event with no upcoming date.
    :raises IndexError: for a Zouk entry with fewer than five pieces.
    """
    courseSplit = re.split("<br />", course)
    timeLine = courseSplit[0]

    # Not a course line: empty/short entry or no leading day digit.
    # (Slicing instead of indexing keeps an empty time line from raising.)
    if len(courseSplit) < 3 or not timeLine[:1].isdigit():
        return None

    dow = int(timeLine[0]) - 1
    if not week:
        dow = dow + 5

    name = courseSplit[1].strip()
    times = ParserUtil.parseTime(timeLine[3:].strip(), '.')

    startDate = ParserUtil.parseStartDate(courseSplit[2].strip(), ".")
    if len(courseSplit) > 3 and startDate == '':
        startDate = ParserUtil.parseStartDate(courseSplit[3], ".")

    instructors = self.getInstructors(courseSplit)

    # Type: look in the name first, then the following piece, then default.
    course_type = ParserUtil.parseType(courseSplit[1])
    if not course_type:
        course_type = ParserUtil.parseType(courseSplit[2])
    if not course_type:
        course_type = "partnerwork"

    # Level: same lookup order; when still unknown it is derived from the
    # course type (and, for partner courses, from the start date distance).
    level = self.parseLevel(courseSplit[1])
    if level == "":
        level = self.parseLevel(courseSplit[2])
    if level == "" and course_type == TYPE_PARTNER:
        if ParserUtil.getDiffInDaysFromNow(startDate) > 84:
            level = LEVEL_INTERMEDIATE
        else:
            level = LEVEL_BASIC
    if level == "" and course_type == TYPE_SOLO:
        level = LEVEL_OPEN

    genre = ParserUtil.parseGenre(courseSplit[1])
    description = ''
    courseCount = self.DEFAULT_COURSE_DAYS_NUM

    if genre and genre[0] == GENRE_ZOUK:
        courseCount = 2
        description = courseSplit[3].strip() + " " + courseSplit[4].strip()

        # Day and month are captured as groups so that one-digit days
        # ("5.12") are read correctly; fixed-offset slicing of the match
        # misread the month and could fail on int("5.").
        # The separator stays a wildcard to keep the original match set.
        today = date.today()
        upcoming = False
        for day_text, month_text in re.findall(r"(\d{1,2}).(\d{2})", description):
            month = int(month_text)
            if month > today.month:
                upcoming = True
            elif month == today.month and int(day_text) >= today.day:
                upcoming = True
        # NOTE(review): the comparison ignores the year, so a January date
        # seen in December counts as past; an entry with no dates at all is
        # also dropped. Both are preserved from the original logic.
        if not upcoming:
            return None

    originalLevel = level

    # Arguments as passed here: name, description, school, level,
    # originalLevel, days, genre, type, classCount, instructors, startDate,
    # startTime, endTime, duration
    return self.toJSONFormatCourse(
        name,
        description,
        self.SCHOOL_NAME,
        level,
        originalLevel,
        [dow],
        genre,
        course_type,
        courseCount,
        instructors,
        startDate,
        times[0],
        times[1],
        ParserUtil.getDuration(times[1], times[0]))
def extract(self, soup):
    """Collect JSON course entries from a parsed schedule page.

    Walks every ``div.dzien`` (one per day), the hour cells
    ``div.godzina godzinaHH`` for HH in 17..21 inside it, and each
    ``div.technika`` inside an hour cell. Every style with a usable genre
    becomes one course lasting 60 minutes with a class count of 4.

    :param soup: parsed document exposing ``findAll`` (BeautifulSoup-like).
    :returns: list of the values returned by ``self.toJSONFormatCourse``.
    :raises KeyError: when a genre, type or weekday label is missing from
        its lookup table, or when an hour cell has no recognised level,
        instructors or start date.
    """
    evening = range(17, 22)
    courses = []

    for day in soup.findAll('div', 'dzien'):
        name = day.find('strong', 'dayname').text.encode('utf-8')
        for hour in evening:
            for course in day.findAll('div', 'godzina godzina%s' % hour):
                for style in course.findAll('div', 'technika'):
                    if not style:
                        continue

                    genre = style.find('strong').text
                    parsedGenre = genresLofty[genre.encode('utf-8')]
                    # Entries without a usable genre (empty mapping or a
                    # party) are skipped here; otherwise they would reach
                    # the c["genre"] lookup below without that key.
                    if not parsedGenre or parsedGenre[0] == GENRE_PARTY:
                        continue

                    c = {'genre': parsedGenre}

                    # Whatever text follows the genre label is the type.
                    parsedType = style.text.replace(genre, '').encode('utf-8')
                    courseName = genre.encode('utf-8')
                    if parsedType:
                        courseName = courseName + " (" + parsedType + ")"
                    c['name'] = courseName.replace('&', "and").replace("ż", "z").replace("ń", "n").replace("ę", "e")

                    # NOTE(review): the two checks are read as sequential,
                    # i.e. an empty type (either from the page or from the
                    # table) falls back to a lookup of the whole style
                    # text -- confirm against the original layout.
                    if parsedType != "":
                        parsedType = typesLofty[parsedType]
                    if parsedType == "":
                        parsedType = typesLofty[style.text.encode('utf-8')]
                    c['type'] = parsedType

                    # First level whose key matches a div class wins.
                    for l in levels:
                        if course.find('div', {'class': re.compile(l)}):
                            c['level'] = levels[l]
                            c['originalLevel'] = levels[l]
                            break

                    c['startTime'] = int('%s00' % hour)
                    c['endTime'] = int('%s00' % (hour + 1))
                    c['duration'] = 60
                    c['classCount'] = 4

                    instructors = course.find('div', 'instruktorzy')
                    if instructors:
                        instructors = instructors.text.replace(unicode('\xc5\x81', 'utf-8'), 'L')
                        c['instructors'] = instructors.split(',')

                    start = course.find('div', {'class': re.compile('start')})
                    if start:
                        # The date is the last whitespace-separated token.
                        c['startDate'] = ParserUtil.parseStartDate(start.text.split()[-1], '.')

                    c['days'] = [weekdays[name]]
                    c['school'] = self.SCHOOL_NAME

                    courses.append(self.toJSONFormatCourse(
                        c["name"], "", c["school"], c["level"],
                        c["originalLevel"], c["days"], c["genre"], c["type"],
                        c["classCount"], c["instructors"], c["startDate"],
                        c["startTime"], c["endTime"], c["duration"]))

    return courses
def get_courses(self, location, fields, year=2012):
    """Build JSON course entries from per-day lists of raw course texts.

    :param location: iterable of sequences ``(date, weekday, text, ...)``:
        item 0 is a "dd.mm" date string, item 1 a weekday label looked up
        in ``weekdays``, the remaining items are raw course texts.
    :param fields: mapping of field name to an object with ``findall``
        (e.g. a compiled regex); the first match becomes the field value.
        Fields read here: ``hours`` ("HH:MM-HH:MM"), ``name`` (sequence of
        words), ``progress`` ("current/total"), optional ``level``, ``genre``.
    :param year: calendar year combined with the "dd.mm" date; defaults to
        2012, the value that used to be hard-coded.
    :returns: list of the values returned by ``self.toJSONFormatCourse``.
    :raises KeyError: when ``name``/``progress`` are missing or a lookup
        table has no entry for the weekday, level or genre.
    """
    courses = []
    for i in location:
        for c in i[2:]:
            d = {}
            for key, expression in fields.items():
                try:
                    d[key] = expression.findall(c)[0]
                except IndexError:
                    # Field not present in this text.
                    pass
            # Texts matching no field, or lacking hours, are not courses.
            if not d or 'hours' not in d:
                continue

            hours = d['hours']
            start = hours.split('-')[0]
            hour, minute = start.split(':')
            start_time = int(hour) * 60 + int(minute)
            start = start.replace(':', '')
            stop = hours.split('-')[1]
            hour, minute = stop.split(':')
            stop_time = int(hour) * 60 + int(minute)
            stop = stop.replace(':', '')

            name = re.sub(r'\s+', ' ', ' '.join(d['name'])).strip()
            progress = d['progress'].split('/')[0]
            classCount = d['progress'].split('/')[1]
            wday = weekdays[i[1].encode('utf-8')]

            # A same-named course that is past its first class is another
            # weekly slot of an entry already collected: record the extra
            # weekday there instead of adding a second entry.
            # NOTE(review): the weekday update is read as applying only to
            # such duplicates -- confirm against the original layout.
            duplicate = False
            for exc in courses:
                if exc['name'] == name and progress != '1':
                    duplicate = True
                    if wday not in exc['days']:
                        exc['days'].append(wday)
            if duplicate:
                continue

            if 'level' in d:
                # NOTE(review): the raw level text was stored as
                # originalLevel but never passed on; the mapped level is
                # sent for both arguments below, as before.
                level = levels[d['level']]
            elif "zaawan" in name:
                # Substring test also matches at the very start of the
                # name, which the previous find(...) > 0 check missed.
                level = LEVEL_ADVANCED
            elif "podstawowy" in name:
                level = LEVEL_BASIC
            else:
                level = LEVEL_OPEN

            if 'genre' in d:
                genre = [genres[d['genre']]]
            else:
                genre = [genres[None]]

            # The listed date is the current class; step back one week per
            # class already held to get the date of the first class.
            day, month = i[0].replace('.', '/').split('/')
            first_class = datetime.date(int(year), int(month), int(day))
            if progress != '1':
                first_class -= datetime.timedelta(days=(int(progress) - 1) * 7)

            course_type = ParserUtil.parseTypeCustomMap(name, typesViva)
            courses.append(self.toJSONFormatCourse(
                name, "", self.SCHOOL_NAME, level, level, [wday], genre,
                course_type, classCount, [],
                first_class.strftime('%Y-%m-%d'),
                int(start), int(stop), stop_time - start_time))
    return courses