Beispiel #1
0
    def _store_data(self, year, match):
        if not match:
            return

        # Check if there are any courses to attach the exam to
        try:
            subject = Subject.objects.get(abbreviation=match.groups()[0].upper())
            courses = Course.objects.filter(subject=subject, number__istartswith=match.groups()[1])
            num_courses = courses.count()
        except ObjectDoesNotExist:
            num_courses = 0

        if num_courses < 1:
            print("--No course '{0} {1}' in database".format(match.groups()[0], match.groups()[1]))
            return

        # Find/Create the course <-> data relation(s)
        course_relations = []
        for course in courses:
            temp = existing_or_new(CourseRelation, course=course)
            temp.save(was_scraped=True)
            course_relations.append(temp)

        exam_attrs = {
                    "year": year,
                    "pdf_url": "http://library.queensu.ca.proxy.queensu.ca{0}".format(match.string)}

        exam = existing_or_new(Exam, **exam_attrs)
        for course_relation in course_relations:
            exam.course_rels.add(course_relation)
        exam.save(was_scraped=True)
        print ("--Added exam pdf for {0} course(s): {1}".format(num_courses, ", ".join([str(course) for course in courses])))
Beispiel #2
0
    def _store_data(self, year, match):
        if not match:
            return

        # Check if there are any courses to attach the exam to
        try:
            subject = Subject.objects.get(abbreviation=match.groups()[0].upper())
            courses = Course.objects.filter(subject=subject, number__istartswith=match.groups()[1])
            num_courses = courses.count()
        except ObjectDoesNotExist:
            num_courses = 0

        if num_courses < 1:
            print ("--No course '{0} {1}' in database".format(match.groups()[0], match.groups()[1]))
            return

        # Find/Create the course <-> data relation(s)
        course_relations = []
        for course in courses:
            temp = existing_or_new(CourseRelation, course=course)
            temp.save(was_scraped=True)
            course_relations.append(temp)

        exam_attrs = {
                    "year": year,
                    "pdf_url": "http://library.queensu.ca.proxy.queensu.ca{0}".format(match.string)}

        exam = existing_or_new(Exam, **exam_attrs)
        for course_relation in course_relations:
            exam.course_rels.add(course_relation)
        exam.save(was_scraped=True)
        print ("--Added exam pdf for {0} course(s): {1}".format(num_courses, ", ".join([str(course) for course in courses])))
    def scrape(self):
        """Scrape the campus bookstore booklists and store each course's textbooks.

        Steps:
          1. Fetch the booklist index page and collect the course links whose
             text is a run of non-digits followed by digits (used as subject
             text and course number) and that pass the ``self.config.letters``
             filter.
          2. For every collected link with at least one matching ``Course``
             in the database, find or create a ``CourseRelation`` per course
             and fetch the course page.
          3. Parse each book block on that page and save a ``Textbook`` linked
             to those relations when the book has at least one ISBN.
        """
        print("Starting textbook scrape")

        print("Getting a list of courses")
        r = requests.get("http://www.campusbookstore.com/Textbooks/Booklists/")

        index_page = BeautifulSoup(r.text)
        content = index_page.find("div", {"class": "thecontent"})
        links = content.find_all("a")

        course_links = []
        for link in links:
            href = link.attrs.get("href", "")
            if "campusbookstore.com/Textbooks/Course/" not in href:
                continue
            # ``.string`` is None for anchors without a single text child;
            # re.search would raise TypeError on it, so skip those links.
            if link.string is None:
                continue
            m = re.search(r"^(\D+)(\d+).*$", link.string)
            if not m:
                continue
            prefix = m.group(1)
            # Only parse letters in config.
            # NOTE(review): the filter inspects the *second* character of the
            # subject text -- confirm that index 1 (not 0) is intended.  The
            # length guard avoids an IndexError on one-character prefixes.
            if len(prefix) > 1 and prefix[1].upper() in self.config.letters:
                course_links.append((prefix, m.group(2), href))

        print("Parsing courses")
        for subject_text, number, url in course_links:

            # Check if there are any courses to attach the book to
            try:
                subject = Subject.objects.get(abbreviation=subject_text)
                courses = Course.objects.filter(subject=subject, number__istartswith=number)
                num_courses = courses.count()
            except ObjectDoesNotExist:
                num_courses = 0

            if num_courses < 1:
                print("--No course '{0} {1}' in database".format(subject_text, number))
                continue

            # Find/Create the course <-> textbook relation(s)
            course_relations = []
            for course in courses:
                relation = existing_or_new(CourseRelation, course=course)
                relation.save(was_scraped=True)
                course_relations.append(relation)

            print(
                "--Parsing books from {0} course(s): {1}".format(
                    num_courses, ", ".join([str(course) for course in courses])
                )
            )

            r = requests.get(url)
            course_page = BeautifulSoup(r.text)

            # Looking at the page source, 49 books seems to be the limit
            # (numbers padded to 2 digits).  Only even indices are visited and
            # the first missing block ends the page.
            for i in range(0, 99, 2):

                book_id = "ctl00_ContentBody_ctl00_CourseBooksRepeater_ctl{:02d}_test_ModeFull".format(i)

                book = course_page.find("div", {"id": book_id})
                if not book:
                    break

                cell = book.find("table").find("table").find_all("td")[1]

                textbook_attrs = {"listing_url": url + "#" + book_id}

                # Title
                # NOTE(review): a missing title still becomes the text "None";
                # left unchanged because the Textbook model's constraints on
                # ``title`` are not visible from here.
                textbook_attrs["title"] = unicode(self._book_field(cell, i, "BookTitle"))

                # Authors (the label text is expected to start with " by ")
                authors = self._book_field(cell, i, "BookAuthor")
                if authors and authors[:4] == " by ":
                    textbook_attrs["authors"] = authors[4:]

                # Required
                required = self._book_field(cell, i, "StatusLabel")
                if required and "REQUIRED" in required.upper():
                    textbook_attrs["required"] = True

                # ISBN 13 -- a missing label must stay None; converting None
                # to text would yield the truthy string "None" and let a book
                # without any ISBN through the check below.
                isbn_13 = self._book_field(cell, i, "ISBN13Label")
                if isbn_13 is None or "[N/A]" in isbn_13:
                    textbook_attrs["isbn_13"] = None
                else:
                    textbook_attrs["isbn_13"] = unicode(isbn_13)

                # ISBN 10 -- same rule as ISBN 13
                isbn_10 = self._book_field(cell, i, "ISBN10Label")
                if isbn_10 is None or "[N/A]" in isbn_10:
                    textbook_attrs["isbn_10"] = None
                else:
                    textbook_attrs["isbn_10"] = unicode(isbn_10)

                # New data
                new_price = self.price(self._book_field(cell, i, "NewPriceLabel"))
                new_available = self.num_available(self._book_field(cell, i, "NewAvailabilityLabel"))
                if new_price:
                    textbook_attrs["new_price"] = new_price
                if new_available:
                    textbook_attrs["new_available"] = new_available

                # Used data
                used_price = self.price(self._book_field(cell, i, "UsedPriceLabel"))
                used_available = self.num_available(self._book_field(cell, i, "UsedAvailabilityLabel"))
                if used_price:
                    textbook_attrs["used_price"] = used_price
                if used_available:
                    textbook_attrs["used_available"] = used_available

                # Classifieds info (an <a> element, unlike the other fields)
                classified_info = self._book_field(cell, i, "ClassifiedsLabel", tag="a")
                if classified_info:
                    textbook_attrs["classified_info"] = classified_info

                # Add the textbook only when it can be identified by an ISBN
                if textbook_attrs["isbn_10"] or textbook_attrs["isbn_13"]:

                    textbook = existing_or_new(Textbook, **textbook_attrs)
                    for course_relation in course_relations:
                        textbook.course_rels.add(course_relation)
                    textbook.save(was_scraped=True)
                    print("----Parsed book: " + str(textbook))

    @staticmethod
    def _book_field(container, index, suffix, tag="span"):
        """Return the ``.string`` of one book-repeater element, or None.

        Looks inside ``container`` for a ``tag`` element whose id is
        ``ctl00_ContentBody_ctl00_CourseBooksRepeater_ctl<index>_test_<suffix>``
        (index zero-padded to two digits).  Returns None when the element is
        absent instead of raising AttributeError.
        """
        element_id = "ctl00_ContentBody_ctl00_CourseBooksRepeater_ctl{:02d}_test_{}".format(index, suffix)
        element = container.find(tag, {"id": element_id})
        if element is None:
            return None
        return element.string
Beispiel #4
0
    def scrape(self):
        """Scrape textbook listings from the campus bookstore into the database.

        The booklist index page is fetched first; each link to a course page
        whose text looks like "<non-digits><digits>..." and passes the
        ``self.config.letters`` filter is kept.  For every kept link that
        matches at least one ``Course`` in the database, the course page is
        fetched and each book block on it is parsed (title, authors, required
        flag, ISBNs, new/used price and availability, classifieds info).  A
        ``Textbook`` is saved, linked to a ``CourseRelation`` per course, only
        when the book has at least one ISBN.
        """
        id_template = "ctl00_ContentBody_ctl00_CourseBooksRepeater_ctl{:02d}_test_{}"

        def field(cell, index, suffix, tag="span"):
            """Return ``.string`` of the element with the given id suffix, or None if absent."""
            element = cell.find(tag, {"id": id_template.format(index, suffix)})
            return None if element is None else element.string

        # Call-form print with a single argument is used throughout so the
        # method no longer mixes print statements and print calls.
        print("Starting textbook scrape")

        print("Getting a list of courses")
        r = requests.get("http://www.campusbookstore.com/Textbooks/Booklists/")

        b = BeautifulSoup(r.text)
        content = b.find("div", {"class": "thecontent"})

        found = []
        for link in content.find_all("a"):
            href = link.attrs.get("href", "")
            # ``link.string`` is None when the anchor has no single text
            # child; re.search would raise TypeError on it.
            if "campusbookstore.com/Textbooks/Course/" not in href or link.string is None:
                continue
            m = re.search(r"^(\D+)(\d+).*$", link.string)
            if m is None:
                continue
            subject_text = m.group(1)
            # Only parse letters in config.
            # NOTE(review): this checks the second character of the subject
            # text -- confirm index 1 rather than 0 is intended.  One-character
            # subject text is skipped instead of raising IndexError.
            if len(subject_text) < 2:
                continue
            if subject_text[1].upper() in self.config.letters:
                found.append((subject_text, m.group(2), href))

        print("Parsing courses")
        for s, c, l in found:

            # Check if there are any courses to attach the book to
            try:
                subject = Subject.objects.get(abbreviation=s)
                courses = Course.objects.filter(subject=subject,
                                                number__istartswith=c)
                num_courses = courses.count()
            except ObjectDoesNotExist:
                num_courses = 0

            if num_courses < 1:
                print("--No course '{0} {1}' in database".format(s, c))
                continue

            # Find/Create the course <-> textbook relation(s)
            course_relations = []
            for course in courses:
                course_relation = existing_or_new(CourseRelation, course=course)
                course_relation.save(was_scraped=True)
                course_relations.append(course_relation)

            print("--Parsing books from {0} course(s): {1}".format(
                num_courses, ", ".join([str(course) for course in courses])))

            r = requests.get(l)
            b = BeautifulSoup(r.text)

            # Looking at the page source, 49 books seems to be the limit
            # (numbers padded to 2 digits).  Only even indices are probed and
            # parsing of the page stops at the first index with no book block.
            for i in range(0, 99, 2):

                book_id = id_template.format(i, "ModeFull")

                book = b.find("div", {"id": book_id})
                if not book:
                    break

                cell = book.find("table").find("table").find_all("td")[1]

                textbook_attrs = {"listing_url": l + "#" + book_id}

                # Title
                # NOTE(review): an absent title is still stored as the text
                # "None"; not changed here because whether ``title`` may be
                # omitted is not visible from this method.
                textbook_attrs["title"] = unicode(field(cell, i, "BookTitle"))

                # Authors (the label text is expected to start with " by ")
                authors = field(cell, i, "BookAuthor")
                if authors and authors[:4] == " by ":
                    textbook_attrs["authors"] = authors[4:]

                # Required
                required = field(cell, i, "StatusLabel")
                if required and "REQUIRED" in required.upper():
                    textbook_attrs["required"] = True

                # ISBNs: a missing label or an "[N/A]" marker both mean "no
                # ISBN".  Converting a missing label to text would produce the
                # truthy string "None" and defeat the ISBN check further down.
                for key, suffix in (("isbn_13", "ISBN13Label"),
                                    ("isbn_10", "ISBN10Label")):
                    raw_isbn = field(cell, i, suffix)
                    if raw_isbn is None or "[N/A]" in raw_isbn:
                        textbook_attrs[key] = None
                    else:
                        textbook_attrs[key] = unicode(raw_isbn)

                # New and used data share the same label layout.
                for condition in ("New", "Used"):
                    price = self.price(field(cell, i, condition + "PriceLabel"))
                    available = self.num_available(
                        field(cell, i, condition + "AvailabilityLabel"))
                    if price:
                        textbook_attrs[condition.lower() + "_price"] = price
                    if available:
                        textbook_attrs[condition.lower() + "_available"] = available

                # Classifieds info (an <a> element rather than a <span>)
                classified_info = field(cell, i, "ClassifiedsLabel", tag="a")
                if classified_info:
                    textbook_attrs["classified_info"] = classified_info

                # Add the textbook only when at least one ISBN identifies it
                if textbook_attrs["isbn_10"] or textbook_attrs["isbn_13"]:

                    textbook = existing_or_new(Textbook, **textbook_attrs)
                    for course_relation in course_relations:
                        textbook.course_rels.add(course_relation)
                    textbook.save(was_scraped=True)
                    print("----Parsed book: " + str(textbook))
Beispiel #5
0
def existing_or_new_with_time(model, **kwargs):
    """Call ``existing_or_new`` with ``last_encountered`` set to the current time.

    Any ``last_encountered`` value supplied by the caller is replaced by
    ``datetime.datetime.now()``.  The result of ``existing_or_new`` is
    returned unchanged.
    """
    stamped = dict(kwargs, last_encountered=datetime.datetime.now())
    return existing_or_new(model, **stamped)