Python sanitize Examples, sanitizr.sanitize Python Examples

Example #1

0

Show file

File: courseprocesser.py Project: skairunner/JeevesCoursePlanner

def processcourse(course):
    """Parse one scraped course record in place.

    Reads ``course["header"]``, ``course["table"]`` (an HTML fragment) and
    ``course["desc"]``.  Writes ``course["name"]``, ``course["title"]``,
    ``course["components"]`` (one dict per matching table cell),
    ``course["searchable"]`` and ``course["requiredcomponents"]``, then
    deletes ``course["table"]`` and ``course["header"]``.  Returns None.

    Every per-component field (units, number, section, location,
    componentType, topic, notes, instructor, classtimes) is optional and is
    only set when the corresponding text is found in the cell.

    Raises:
        AttributeError: if the header contains no ``word-word word`` course
            code (``re.search`` returns None).
        KeyError: if ``header``, ``table`` or ``desc`` is missing, or a
            weekday abbreviation is not a key of ``DayToNumber``.
    """
    soup = BS(course["table"], "html5lib")
    for tag in soup("p"):
        tag.unwrap()

    header = course["header"]
    course["name"] = re.search(r"[\w]+-[\w]+\s+[\w]+", header).group(0)
    # The title is whatever follows the course code.  Prefer splitting on
    # "code + space"; fall back to the bare code when no space follows it.
    # Any other exception simply propagates unchanged.
    try:
        course["title"] = header.split(course["name"] + " ")[1]
    except IndexError:
        course["title"] = header.split(course["name"])[1]

    # A dirty hack to easily find the innermost <td>s
    cmps = soup("td", style="background-color: white; font-family: arial; font-size: 12px;")
    course["components"] = []
    components = course["components"]
    componentTypes = set()
    for td in cmps:
        # Split the cell's children into "lines" at every <br>.
        lines = []
        line = []
        for tag in td:
            if tag.name == "br":
                lines.append(line)
                line = []
            else:
                line.append(tag)
        if line:
            lines.append(line)
        if not lines:
            # An empty cell carries no component data; nothing to record.
            continue

        component = {}
        components.append(component)

        s = 0  # starting line #
        # First line has course name, units, #, section
        first_line = stringFromTags(lines[s])
        if "Topic:" in first_line:
            for node in lines[s]:
                if "Topic:" in node:
                    component["topic"] = node.text.split("Topic: ")[-1]
                    print(component["topic"])
                    break

        r = re.search(r"(\d+) units", first_line)
        if r:
            component["units"] = int(r.group(1))
        r = re.search(r"Class#: (\d+)", first_line)
        if r:
            component["number"] = int(r.group(1))
        r = re.search(r"Section: (\w+)", first_line)
        if r:
            component["section"] = r.group(1)

        # Second line has nothing important.
        # Third line has location and component type; short cells simply
        # lack these fields instead of aborting the whole course.
        if len(lines) > s + 2:
            flattened = stringFromTags(lines[s + 2])
            r = re.search(r"Location: (\w+)", flattened)
            if r:
                component["location"] = r.group(1)
            r = re.search(r"Component: (\w+)", flattened)
            if r:
                component["componentType"] = r.group(1)
                componentTypes.add(r.group(1))

        # Fourth line has times and instructor.
        # Fifth line might have times, or it might have notes.
        # Basically: figure out if the line is Notes or not, then figure out
        # if it's a test date or a proper date.
        for i in range(s + 3, len(lines)):
            flattened = stringFromTags(lines[i])
            if "Notes:" in flattened:
                note = flattened.split("Notes:")[1].strip()
                if "notes" in component:
                    component["notes"] += "\n" + note
                else:
                    component["notes"] = note
                continue

            # Possibly a date range: exactly two dd/dd/dddd dates.
            dates = re.findall(r"\d{2}\/\d{2}\/\d{4}", flattened)
            if len(dates) != 2:
                continue
            if dates[0] == dates[1]:
                continue  # it's a test date.

            # Otherwise add to class times.  The leading date range is
            # assumed to occupy the first 24 characters of the line.
            flattened = flattened[24:]
            # Might have an instructor name in it.
            if "with" in flattened:
                component["instructor"] = flattened.split("with ")[-1].strip()
                # find() rather than index(): "with" may occur without a
                # leading space, in which case there is nothing to cut.
                cut = flattened.find(" with")
                if cut != -1:
                    flattened = flattened[:cut]

            # Extract the times
            times = re.search(r"((\w{3},?)+) (\d{1,2})\.(\d{2}) (\w{2}) - (\d{1,2})\.(\d{2}) (\w{2})", flattened)
            if not times:
                continue

            # The raw time text is also kept in the notes.
            if "notes" in component:
                component["notes"] += "\n" + times.group(0)
            else:
                component["notes"] = times.group(0)
            if "classtimes" not in component:
                component["classtimes"] = []

            startdays = times.group(1)
            startH = int(times.group(3))
            startM = int(times.group(4))
            startAP = times.group(5)
            endH = int(times.group(6))
            endM = int(times.group(7))
            endAP = times.group(8)

            # 12-hour -> 24-hour: 12 PM stays 12, 12 AM becomes 0.
            if startAP == "PM" and startH != 12:
                startH += 12
            elif startAP == "AM" and startH == 12:
                startH = 0
            if endAP == "PM" and endH != 12:
                endH += 12
            elif endAP == "AM" and endH == 12:
                endH = 0

            for day in startdays.split(","):
                if not day:
                    # A trailing comma in the day list yields an empty token.
                    continue
                component["classtimes"].append({
                    "day": DayToNumber[day],
                    "starttime": [startH, startM],
                    "endtime": [endH, endM]
                })

    # Next, create the searchable corpus of the course.
    searchable = [course["title"], course["desc"]]
    units = []
    coursenumbers = []
    for component in components:
        for key in ("topic", "notes", "instructor", "location"):
            if key in component:
                searchable.append(component[key])
        if "units" in component:
            units.append(str(component["units"]) + " units")  # separate because sanitization
        # "number" is only present when "Class#:" was found in the cell.
        if "number" in component:
            coursenumbers.append(str(component["number"]))
    searchableStr = sanitize(" ".join(searchable))
    # Course code, class numbers and units are added after sanitize() so
    # they bypass it; units are space-separated so entries stay distinct.
    searchableStr = " ".join([course["name"].lower(), " ".join(coursenumbers), " ".join(units), searchableStr])
    course["searchable"] = searchableStr

    # Add required components
    course["requiredcomponents"] = list(componentTypes)

    # Clean up
    del course["table"]
    del course["header"]

Example #2

0

Show file

File: courseprocesser.py Project: Skyyrunner/JeevesCoursePlanner

def processcourse(course):
    """Parse one scraped course record in place.

    Reads ``course["header"]``, ``course["table"]`` (an HTML fragment) and
    ``course["desc"]``.  Writes ``course["name"]``, ``course["title"]``,
    ``course["components"]`` (one dict per matching table cell),
    ``course["searchable"]`` and ``course["requiredcomponents"]``, then
    deletes ``course["table"]`` and ``course["header"]``.  Returns None.

    Every per-component field (units, number, section, location,
    componentType, topic, notes, instructor, classtimes) is optional and is
    only set when the corresponding text is found in the cell.

    Raises:
        AttributeError: if the header contains no ``word-word word`` course
            code (``re.search`` returns None).
        KeyError: if ``header``, ``table`` or ``desc`` is missing, or a
            weekday abbreviation is not a key of ``DayToNumber``.
    """
    soup = BS(course["table"], "html5lib")
    for tag in soup("p"):
        tag.unwrap()

    header = course["header"]
    course["name"] = re.search(r"[\w]+-[\w]+\s+[\w]+", header).group(0)
    # The title is whatever follows the course code.  Prefer splitting on
    # "code + space"; fall back to the bare code when no space follows it.
    # Any other exception simply propagates unchanged.
    try:
        course["title"] = header.split(course["name"] + " ")[1]
    except IndexError:
        course["title"] = header.split(course["name"])[1]

    # A dirty hack to easily find the innermost <td>s
    cmps = soup("td", style="background-color: white; font-family: arial; font-size: 12px;")
    course["components"] = []
    components = course["components"]
    componentTypes = set()
    for td in cmps:
        # Split the cell's children into "lines" at every <br>.
        lines = []
        line = []
        for tag in td:
            if tag.name == "br":
                lines.append(line)
                line = []
            else:
                line.append(tag)
        if line:
            lines.append(line)
        if not lines:
            # An empty cell carries no component data; nothing to record.
            continue

        component = {}
        components.append(component)

        s = 0  # starting line #
        # First line has course name, units, #, section
        first_line = stringFromTags(lines[s])
        if "Topic:" in first_line:
            for node in lines[s]:
                if "Topic:" in node:
                    component["topic"] = node.text.split("Topic: ")[-1]
                    print(component["topic"])
                    break

        r = re.search(r"(\d+) units", first_line)
        if r:
            component["units"] = int(r.group(1))
        r = re.search(r"Class#: (\d+)", first_line)
        if r:
            component["number"] = int(r.group(1))
        r = re.search(r"Section: (\w+)", first_line)
        if r:
            component["section"] = r.group(1)

        # Second line has nothing important.
        # Third line has location and component type; short cells simply
        # lack these fields instead of aborting the whole course.
        if len(lines) > s + 2:
            flattened = stringFromTags(lines[s + 2])
            r = re.search(r"Location: (\w+)", flattened)
            if r:
                component["location"] = r.group(1)
            r = re.search(r"Component: (\w+)", flattened)
            if r:
                component["componentType"] = r.group(1)
                componentTypes.add(r.group(1))

        # Fourth line has times and instructor.
        # Fifth line might have times, or it might have notes.
        # Basically: figure out if the line is Notes or not, then figure out
        # if it's a test date or a proper date.
        for i in range(s + 3, len(lines)):
            flattened = stringFromTags(lines[i])
            if "Notes:" in flattened:
                note = flattened.split("Notes:")[1].strip()
                if "notes" in component:
                    component["notes"] += "\n" + note
                else:
                    component["notes"] = note
                continue

            # Possibly a date range: exactly two dd/dd/dddd dates.
            dates = re.findall(r"\d{2}\/\d{2}\/\d{4}", flattened)
            if len(dates) != 2:
                continue
            if dates[0] == dates[1]:
                continue  # it's a test date.

            # Otherwise add to class times.  The leading date range is
            # assumed to occupy the first 24 characters of the line.
            flattened = flattened[24:]
            # Might have an instructor name in it.
            if "with" in flattened:
                component["instructor"] = flattened.split("with ")[-1].strip()
                # find() rather than index(): "with" may occur without a
                # leading space, in which case there is nothing to cut.
                cut = flattened.find(" with")
                if cut != -1:
                    flattened = flattened[:cut]

            # Extract the times
            times = re.search(r"((\w{3},?)+) (\d{1,2})\.(\d{2}) (\w{2}) - (\d{1,2})\.(\d{2}) (\w{2})", flattened)
            if not times:
                continue

            # The raw time text is also kept in the notes.
            if "notes" in component:
                component["notes"] += "\n" + times.group(0)
            else:
                component["notes"] = times.group(0)
            if "classtimes" not in component:
                component["classtimes"] = []

            startdays = times.group(1)
            startH = int(times.group(3))
            startM = int(times.group(4))
            startAP = times.group(5)
            endH = int(times.group(6))
            endM = int(times.group(7))
            endAP = times.group(8)

            # 12-hour -> 24-hour: 12 PM stays 12, 12 AM becomes 0.
            if startAP == "PM" and startH != 12:
                startH += 12
            elif startAP == "AM" and startH == 12:
                startH = 0
            if endAP == "PM" and endH != 12:
                endH += 12
            elif endAP == "AM" and endH == 12:
                endH = 0

            for day in startdays.split(","):
                if not day:
                    # A trailing comma in the day list yields an empty token.
                    continue
                component["classtimes"].append({
                    "day": DayToNumber[day],
                    "starttime": [startH, startM],
                    "endtime": [endH, endM]
                })

    # Next, create the searchable corpus of the course.
    searchable = [course["title"], course["desc"]]
    units = []
    coursenumbers = []
    for component in components:
        for key in ("topic", "notes", "instructor", "location"):
            if key in component:
                searchable.append(component[key])
        if "units" in component:
            units.append(str(component["units"]) + " units")  # separate because sanitization
        # "number" is only present when "Class#:" was found in the cell.
        if "number" in component:
            coursenumbers.append(str(component["number"]))
    searchableStr = sanitize(" ".join(searchable))
    # Course code, class numbers and units are added after sanitize() so
    # they bypass it; units are space-separated so entries stay distinct.
    searchableStr = " ".join([course["name"].lower(), " ".join(coursenumbers), " ".join(units), searchableStr])
    course["searchable"] = searchableStr

    # Add required components
    course["requiredcomponents"] = list(componentTypes)

    # Clean up
    del course["table"]
    del course["header"]

Example #3

0

Show file

File: courseprocesser.py Project: skairunner/JeevesCoursePlanner

def addWordsToSet(line, s):
    """Add each whitespace-separated word of the sanitized ``line`` to ``s``.

    ``line`` is passed through ``sanitize`` (defined elsewhere in the
    project), the result is split on whitespace, and every resulting token
    is added to ``s`` via ``s.add``.  ``s`` is mutated in place; nothing is
    returned.
    """
    for word in sanitize(line).split():
        s.add(word)

Example #4

0

Show file

File: courseprocesser.py Project: Skyyrunner/JeevesCoursePlanner

def addWordsToSet(line, s):
    """Add each whitespace-separated word of the sanitized ``line`` to ``s``.

    ``line`` is passed through ``sanitize`` (defined elsewhere in the
    project), the result is split on whitespace, and every resulting token
    is added to ``s`` via ``s.add``.  ``s`` is mutated in place; nothing is
    returned.
    """
    for word in sanitize(line).split():
        s.add(word)