def open_spider(self, spider):
    # designate table and fields to populate
    if spider.name in ['techmeme']:
        self.table = 'du_agg_news'
        self.cols = ['title', 'link', 'blurb', 'src']
    elif spider.name in ['github']:
        self.table = 'du_agg_projects'
        self.cols = ['title', 'link', 'blurb', 'lang', 'updated', 'stars', 'forks', 'src']
    elif spider.name in ['coursera']:
        self.table = 'du_agg_courses'
        self.cols = ['title', 'link', 'blurb', 'school', 'school_link', 'course_date', 'course_length', 'src']
    elif spider.name in ['meetup']:
        self.table = 'du_agg_events'
        self.cols = ['title', 'link', 'blurb', 'host', 'location', 'event_date', 'event_time', 'src']
    else:
        utils.devlog('Cannot get database table for type %s' % spider.name, 'e')
        self.crawler.engine.close_spider(spider, 'Closed spider -- cannot get appropriate database table.')

    # connect to database
    try:
        urlparse.uses_netloc.append('postgres')  # set parsing scheme
        url = urlparse.urlparse(os.environ['DATABASE_URL'])
        self.conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s " %
                                     (url.path[1:], url.username, url.password, url.hostname))
        self.cur = self.conn.cursor()
    except KeyError, e:
        utils.devlog("No DATABASE_URL is set - cannot get database connection information.", 'e')
        self.crawler.engine.close_spider(spider, 'Closed spider -- cannot get database connection information.')
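# Illustrative note (derived from the code above, not part of the original source): with a
# Heroku-style connection URL such as
#   DATABASE_URL=postgres://deepuser:secret@db.example.com:5432/deepdb
# the urlparse call yields url.path[1:] == 'deepdb', url.username == 'deepuser',
# url.password == 'secret' and url.hostname == 'db.example.com', which is what the psycopg2
# connection string is assembled from (the port, if present, is not passed along).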
def process_item(self, item, spider):
    if item['link']:
        if item['link'] in self.pages_seen:
            utils.devlog("Link '%s' is a duplicate!" % item['link'], 'w')
            raise DropItem("Duplicate link found: %s" % item)
        else:
            self.pages_seen.add(item['link'])
    return item
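# Note: self.pages_seen is never initialized in this excerpt. A minimal sketch of what the
# duplicates filter presumably needs -- the attribute name comes from the code above, but the
# class name and where the set is created are assumptions:
#
#     class DuplicatesPipeline(object):      # hypothetical class name
#         def __init__(self):
#             self.pages_seen = set()        # links already emitted during this crawl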
def parseItem(self, response):
    # Detail-page callback -- not yet wired up from parse(), since the event pages are not
    # standardized enough (see the commented-out Request in the listing parser).
    utils.devlog("Beginning on new item...")
    hxs = HtmlXPathSelector(response)
    item = EventItem()
    item['link'] = response.url
    item['title'] = hxs.select('//div[contains(@id, "event-title")]/@data-name').extract()[0]
    #@@@ placeholder values while the start-time selector is being worked out
    item['event_date'] = "BLAH" if not hxs.select('//div[contains(@id, "event-content")]//li[contains(@id, "event-when")]//time[contains(@id, "event-start-time")]/p[1]/text()').extract() else "HAPPY"
    #item['location'] = hxs.select('//div[contains(@id, "event-content")]//li[contains(@id, "event-where")]/@data-name').extract()[0]
    #item['location'] += hxs.select('//div[contains(@id, "event-content")]//li[contains(@id, "event-where")]/@data-address').extract()[0]
    #item['blurb'] += hxs.select('//div[contains(@id, "event-content")]//li[contains(@id, "event-desc")]//p/text()').extract()[0]
    #item['host'] = hxs.select('//div/a[contains(@class, "chapter-name")]/text()').extract()[0]
    item['src'] = "events"
    self.item_cnt += 1
    yield item
def parse(self, response):
    courses = json.loads(response.body)  # grab the JSON from the API search
    for course in courses:
        if self.cats[0] in course['category-ids']:
            item = CourseItem()
            item['title'] = course['name']
            item['blurb'] = course['short_description']
            item['link'] = urlparse.urljoin("https://www.coursera.org/course/", course['short_name'])
            item['school'] = course['universities'][0]['name']
            item['school_link'] = urlparse.urljoin("http://coursera.org/", course['universities'][0]['short_name'])
            item['course_date'] = "TBA" if course['courses'][0]['start_date_string'] in [None, ""] else course['courses'][0]['start_date_string']
            item['course_length'] = "TBA" if course['courses'][0]['duration_string'] in [None, ""] else course['courses'][0]['duration_string']
            item['src'] = "courses"
            self.item_cnt += 1
            yield item
    utils.devlog("All done... total items is %d" % self.item_cnt)
def parse(self, response):
    utils.devlog('Parsing page... depth is %s' % response.meta['depth'])

    # grab info from projects on this page
    #@@@ stars and forks do not work!
    hxs = HtmlXPathSelector(response)
    for project in hxs.select('//li[contains(@class, "public")]'):
        item = ProjectItem()
        item['blurb'] = "" if not project.select('.//p[contains(@class, "description")]/text()').extract() else project.select('.//p[contains(@class, "description")]/text()').extract()[0]
        item['title'] = project.select('./h3/a/text()').extract()[0]
        item['link'] = urlparse.urljoin("https://github.com", project.select('./h3/a/@href').extract()[0])
        item['lang'] = project.select('./ul/li[1]/text()').extract()[0]
        item['updated'] = project.select('.//p[contains(@class, "updated-at")]/time/@title').extract()[0]
        item['stars'] = project.select('.//li[contains(@class, "stargazers")]/a/@href').extract()[0]
        item['forks'] = project.select('.//li[contains(@class, "forks")]/a/text()').extract()[0]
        item['src'] = "projects"
        #print unicode(item['title']).encode('utf8')
        self.item_cnt += 1
        yield item

    # try to parse the next page
    try:
        nextPageLink = hxs.select('//div[contains(@class, "pagination")]/a[contains(@class, "next_page")]/@href').extract()[0]
        nextPageLink = urlparse.urljoin(response.url, nextPageLink)
        utils.devlog("Moving onto next page: link is %s" % nextPageLink)
        yield Request(nextPageLink, callback=self.parse)
    except IndexError:
        utils.devlog("I have reached the last page... total items is %d" % self.item_cnt)
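# The #@@@ note above flags that 'stars' currently captures the stargazers link href rather
# than a count. One possible fix -- an assumption about the listing markup this spider targets,
# not taken from the original source -- is to read the anchor text of both counters instead:
#
#     stars_txt = project.select('.//li[contains(@class, "stargazers")]/a/text()').extract()
#     forks_txt = project.select('.//li[contains(@class, "forks")]/a/text()').extract()
#     item['stars'] = stars_txt[0].strip().replace(',', '') if stars_txt else 0
#     item['forks'] = forks_txt[0].strip().replace(',', '') if forks_txt else 0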
def process_item(self, item, spider):
    try:
        # attempt to get the spider source
        self.cur.execute("select id from du_agg_sources where slug=%s;", (spider.name,))
        if not self.cur.rowcount:
            return item
        else:
            item['src'] = self.cur.fetchone()[0]

        #@@@ attempt to get language ref if necessary
        if spider.name == 'github':
            item['stars'] = 0 if not item['stars'] else item['stars']
            item['forks'] = 0 if not item['forks'] else item['forks']
            self.cur.execute("select id from du_agg_languages where slug = %s or lower(title) = %s;", (item['lang'].lower(), item['lang'].lower()))
            if not self.cur.rowcount:
                self.bad += 1
                return item
            else:
                item['lang'] = self.cur.fetchone()[0]

        # populate dict with fields/vals
        keysvals = dict.fromkeys(self.cols)
        for col in keysvals:
            keysvals[col] = item[col]

        # generate SQL and insert into the DB
        datarep = ("%s," * len(keysvals)).rstrip(',')
        sql = "insert into %s (%s) values (%s);" % (self.table, ', '.join(keysvals.keys()), datarep)
        self.cur.execute(sql, keysvals.values())
        #utils.devlog("QRY: %s" % self.cur.query)
        self.conn.commit()  # maybe should commit in close_spider instead of for each item
    except psycopg2.DatabaseError, e:
        if self.conn:
            self.conn.rollback()
        self.bad += 1
        utils.devlog("Failed to store item entitled '%s' via %s spider: %s" % (item['title'], spider.name, e))
    return item
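# Illustrative example (derived from the code above, not part of the original source): for the
# techmeme spider, self.table is 'du_agg_news' and self.cols holds four fields, so the assembled
# statement looks like
#
#     insert into du_agg_news (title, link, blurb, src) values (%s,%s,%s,%s);
#
# with keysvals.values() supplying the four parameters. The column order follows the dict's
# iteration order, which is why keys() and values() must be taken from the same dict.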
def close_spider(self, spider):
    if self.conn:
        self.conn.close()
    utils.devlog("Number of items not stored: %s" % self.bad)
def parse(self, response):
    depth = response.meta['depth']
    utils.devlog('depth is %s' % depth)

    # when to quit!!
    if depth > 60:
        utils.devlog("I have crawled enough pages... total items is %d" % self.item_cnt)
        return

    # generate a cutoff date so we don't search for meetups in the distant future
    now = datetime.now()
    cutoff = now + timedelta(weeks=12)
    utils.devlog("The cutoff date is %s" % cutoff.strftime("%B %d, %Y"))

    # grab info from events on this page
    hxs = HtmlXPathSelector(response)
    for event in hxs.select('//ul[contains(@class, "event-listing-container")]/li[contains(@class, "event-listing")]'):
        # stop at cutoff date so we don't get too many
        date = datetime(int(event.select('@data-year').extract()[0]),
                        int(event.select('@data-month').extract()[0]),
                        int(event.select('@data-day').extract()[0]))
        if date > cutoff:
            utils.devlog("I have reached the cutoff date... total items is %d" % self.item_cnt)
            return

        # not yet in use - pages are not standardized enough!
        #link = event.select('./a[contains(@class, "list-time")]/@href').extract()[0]
        #yield Request(link, callback = self.parseItem)

        item = EventItem()
        item['link'] = event.select('./a[contains(@class, "list-time")]/@href').extract()[0]
        item['event_date'] = date.strftime("%B %d, %Y")
        item['event_time'] = event.select('./a[contains(@class, "list-time")]/text()').extract()[0]
        item['host'] = event.select('./div/a[contains(@class, "chapter-name")]/text()').extract()[0]
        item['title'] = event.select('./div/h4/a[contains(@class, "event-title")]/text()').extract()[0]
        item['blurb'] = ""
        item['location'] = "TBA"
        item['src'] = "events"
        self.item_cnt += 1
        yield item

    # try to parse the next page
    # There is a bug on the meetup site such that the Next link in the HTML is wrong.
    # Therefore I am generating the crawl link manually.
    try:
        p = re.compile('currentpage=([0-9]+)')
        currentPage = int(p.search(response.url).group(1))
        nextPage = currentPage + 1
        offset = currentPage * 64
        nextPageLink = "http://www.meetup.com/find/?offset=%s&psize=64&currentpage=%s&categories=34&radius=10&userFreeform=San+Francisco&events=true&sort=default" % (offset, nextPage)
        #nextPageLink = hxs.select('//div[contains(@class, "simple-infinite-pager")]/a/@href').extract()[0]
        #nextPageLink = urlparse.urljoin(response.url, nextPageLink)
        utils.devlog("Moving onto next page: offset is %s and nextpage is %s which yields link %s" % (offset, nextPage, nextPageLink))
        yield Request(nextPageLink, callback=self.parse)
    except AttributeError:
        utils.devlog("Failed to fetch next page to crawl... total items is %d" % self.item_cnt)
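# Illustrative example (derived from the code above, not part of the original source): if the
# page being parsed was fetched from a URL containing "currentpage=2", the regex captures 2,
# offset becomes 128 (2 pages * psize of 64), and the next request is sent to
#   http://www.meetup.com/find/?offset=128&psize=64&currentpage=3&categories=34&radius=10&userFreeform=San+Francisco&events=true&sort=default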