def scrape_user(self, url):
    # Scrape a Devpost user profile: the user's title, linked external accounts,
    # and portfolio tags.
    r = util.getPage(url)
    if not r:
        return None, None, None
    s = BeautifulSoup(r.text)

    links_html = s.find(id='portfolio-user-links')
    title = ''
    links = {}
    if links_html:
        for li in links_html.findAll('li'):
            if not li.findAll('a'):
                # A list item with no anchor holds the user's title.
                title = util.removeSpaceNewLine(li.get_text())
            elif li.a:
                nameOfSite = util.removeSpaceNewLine(li.get_text())
                urlOfSite = li.a['href']
                if nameOfSite.lower() == 'github' or urlOfSite.find('github') > -1:
                    info = self.scrape_github(urlOfSite)
                    links[nameOfSite] = info
                # elif nameOfSite.lower() == 'twitter' or urlOfSite.find('twitter') > -1:
                #     info = self.scrape_twitter(urlOfSite)
                elif nameOfSite.lower() == 'linkedin' or urlOfSite.find('linkedin') > -1:
                    info = self.scrape_linked_in(urlOfSite)
                    links[nameOfSite] = info
                # else:
                #     info = self.scrape_arbitrary(urlOfSite)

    tags_html = [t for t in s.findAll('ul')
                 if 'class' in t.attrs and 'portfolio-tags' in t['class']]
    tags = []
    if tags_html:
        for t in tags_html[0].findAll('li'):
            if t.a:
                tags.append(t.a.string)

    return title, links, tags
def scrape_hackathons(self):
    # Walk the paginated hackathon listing and print each hackathon's URL prefix
    # and parsed date.
    for page in xrange(1, 3000):
        r = util.getPage('http://devpost.com/hackathons?page=' + str(page))
        if not r:
            continue
        s = BeautifulSoup(r.text)
        for article in s.findAll('article'):
            if 'class' in article.attrs and 'challenge-listing' in article['class']:
                href = article.a.get('href').split('.devpost')[0]
                print href
                for sub in article.findAll('span'):
                    if 'class' in sub.attrs and 'value' in sub['class'] and 'date-range' in sub['class']:
                        datestring = u"{}".format(sub.text).encode('utf8')
                        # Keep only ASCII characters; after a non-ASCII character
                        # (e.g. a dash in a date range) skip everything up to the
                        # next comma so a single parseable date remains.
                        formattedDatestring = []
                        forbidden = False
                        for c in datestring:
                            if ord(c) < 128 and not forbidden:
                                formattedDatestring.append(c)
                            elif ord(c) < 128:
                                if c == ',':
                                    forbidden = False
                            else:
                                forbidden = True
                        formattedDatestring = ''.join(formattedDatestring)
                        d = parse(formattedDatestring)  # assumes dateutil.parser.parse
                        print d
def scrape_arbitrary(self, url):
    r = util.getPage(url)
    if r:
        s = BeautifulSoup(r.text)
        return s.get_text()
    else:
        return None
def __get_review_urls(self, area, campsite_id, index, per_page):
    html = getPage("{}/{}/{}/review/?O1={}&L1={}&".format(
        self.SITE_URL, area, campsite_id, index, per_page))
    soup = BeautifulSoup(html, "html.parser")
    links = soup.select("p.review_sentence a.more_info")
    paths = [link.get("href") for link in links]
    return [self.SITE_URL + path for path in paths]
def scrape_project(self, url):
    # Scrape a Devpost project page: its hackathon, team members (with their
    # scraped profiles), and the free-text project details.
    r = util.getPage(url)
    if not r:
        return None, None, None
    s = BeautifulSoup(r.text)

    hackathon = ''
    submissions = s.find(id='submissions')
    if submissions:
        for sub in submissions.findAll('div'):
            if 'class' in sub.attrs and 'software-list-content' in sub['class']:
                hackathon = sub.a['href']

    members = {}
    team = s.find(id='app-team')
    if team:
        for sub in team.findAll('li'):
            if 'class' in sub.attrs and 'software-team-member' in sub['class']:
                for link in sub.findAll('a'):
                    if 'class' in link.attrs and 'user-profile-link' in link['class']:
                        if link.string:
                            profileUrl = link['href']
                            name = link.string
                            userInfo = self.scrape_user(profileUrl)
                            members[name] = userInfo

    parsedDetails = ''
    details = s.find(id='app-details-left')
    if details:
        for d in details.findAll('div'):
            if 'id' not in d.attrs or (d['id'] != 'built-with'):
                parsedDetails += d.get_text()

    return hackathon, members, parsedDetails
def __get_review(self, url):
    html = getPage(url)
    soup = BeautifulSoup(html, "html.parser")
    dds = soup.select("div.review_text dl dd")
    if len(dds) > 0:
        review = ''.join(
            [d.text.replace('\r\n', '').replace('\n', '') for d in dds])
    else:
        # Fall back to the summary sentence when the full review body is absent.
        sentences = soup.select("p.review_sentence")[0]
        review = sentences.text.strip().replace('\r', '').replace('\n', '')
    return review
def scrape_github(self, url):
    # Scrape a GitHub profile for contribution, follower, and repository counts.
    r = util.getPage(url)
    if not r:
        return None
    totalContributions = 0
    numFollowers = 0
    numFollowing = 0
    numStarred = 0
    s = BeautifulSoup(r.text)
    for div in s.findAll('div'):
        if 'class' in div.attrs and 'contrib-column-first' in div['class']:
            for sub in div.findAll('span'):
                if 'class' in sub.attrs and 'contrib-number' in sub['class']:
                    minusTotal = sub.string.split(" total")[0]
                    minusComma = minusTotal.replace(' ', '').replace(',', '')
                    totalContributions = int(minusComma)
        if 'class' in div.attrs and 'vcard-stats' in div['class']:
            for a in div.findAll('a'):
                if a['href'].find('followers') > -1:
                    numFollowers = int(a.strong.string.replace(' ', '').replace(',', ''))
                elif a['href'].find('stars') > -1:
                    numStarred = int(a.strong.string.replace(' ', '').replace(',', ''))
                elif a['href'].find('following') > -1:
                    numFollowing = int(a.strong.string.replace(' ', '').replace(',', ''))

    # Count the repositories listed on the repositories tab.
    url += '?tab=repositories'
    r = util.getPage(url)
    numRepos = 0
    if r:
        s = BeautifulSoup(r.text)
        for h3 in s.findAll('h3'):
            if 'class' in h3.attrs and 'repo-list-name' in h3['class']:
                numRepos += 1

    return {
        'totalContributions': totalContributions,
        'numFollowing': numFollowing,
        'numFollowers': numFollowers,
        'numStarred': numStarred,
        'numRepos': numRepos
    }
def get_area_list(self):
    area_list = []
    html = getPage("{}".format(self.SITE_URL))
    soup = BeautifulSoup(html, "html.parser")
    search_prefecture_elm = soup.find('div', id='main_search_prefecture')
    elms = search_prefecture_elm.find_all('dd')
    for elm in elms:
        links = elm.find_all('a')
        for link in links:
            area = link['href'].split('/')[1]
            area_list.append(area)
    # print(len(area_list))
    return area_list
def scrape_names(self):
    # Collect participant names from the HackMIT participants pages.
    names = set()
    for page in xrange(1, 37):
        r = util.getPage('http://hackmit.devpost.com/participants?page=' + str(page))
        if not r:
            continue
        s = BeautifulSoup(r.text)
        for li in s.findAll('li'):
            if 'class' in li.attrs and 'participant-name' in li['class']:
                noSpaceName = li.text.replace(' ', '').replace('\n', '')
                if noSpaceName:
                    name = li.text.replace('\n', '')
                    names.add(name)
                    print name
    return names
def __get_campsite_list_from_page(self, area, start_no, per_page):
    html = getPage(
        "{}/{}/list?OFFSET={}&LIMIT={}&display_order=21&".format(
            self.SITE_URL, area, start_no, per_page))
    soup = BeautifulSoup(html, "html.parser")
    campsite_elms = soup.select("div.block_campsite_list div.camp_list a")
    campsite_list = []
    for elm in campsite_elms:
        campsite_id = elm.get("href").split('/')[-2]
        campsite_name = elm.select("h2 span.name")[0].string
        campsite_name = campsite_name.replace('\u3000', '')
        campsite_list.append({
            'id': campsite_id,
            'name': campsite_name,
            'area': area
        })
    return campsite_list
def get_available_dates(self, dates_url):
    # Map each available date (formatted with DATE_FORMAT) to its sign-up link.
    results = {}
    try:
        html = util.getPage(dates_url)
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all("a"):
            href = link.get("href")
            if href is not None:
                if href.startswith("mpSignUp.asp"):
                    date = link.get("title")
                    parsed_date = dateutil.parser.parse(date)
                    format_date = parsed_date.strftime(DATE_FORMAT)
                    results[format_date] = href
    except Exception:
        print "exception"
        return results
    return results
def scrape_projects(self, startPage=1, endPage=1221, pf='projects.p'):
    # Page through the Devpost software search API, enrich each project with
    # its hackathon, team members, and details, and pickle the results.
    util.checkPickleFileExistsAndCreate(pf)
    projects = []
    for page in xrange(startPage, endPage + 1):
        print "working on page:", page
        r = util.getPage('http://devpost.com/software/search?page=' + str(page))
        if not r:
            continue
        projects_dict = json.loads(r.text)
        for p in projects_dict['software']:
            hackathon, members, details = self.scrape_project(p['url'])
            p['hackathon'] = hackathon
            p['members'] = members
            p['details'] = details
            del p['photo']
            del p['slug']
            del p['url']
            projects.append(p)
        # Checkpoint everything collected so far after each page.
        util.saveObjectsToPickleFile({'projects': projects}, pf)
    return projects
def scrape_twitter(self, url):
    # Placeholder: the page is fetched but no Twitter data is extracted yet.
    r = util.getPage(url)
    return None
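# Hypothetical usage sketch for the Devpost scraping methods above (assumption:
# they are methods of a scraper class; the name DevpostScraper below is
# illustrative and does not appear in the source).
if __name__ == '__main__':
    scraper = DevpostScraper()
    projects = scraper.scrape_projects(startPage=1, endPage=2, pf='projects.p')
    print "scraped", len(projects), "projects so far"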
def __get_total_review_count(self, area, campsite_id):
    html = getPage("{}/{}/{}/review".format(self.SITE_URL, area, campsite_id))
    soup = BeautifulSoup(html, "html.parser")
    return int(
        soup.select("div.review_num span[itemprop='votes']")[0].string)
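# Hypothetical usage sketch for the campsite-review helpers above (assumption:
# get_area_list and the __get_* helpers belong to a class with SITE_URL set;
# the name CampsiteScraper is illustrative and does not appear in the source).
if __name__ == '__main__':
    scraper = CampsiteScraper()
    areas = scraper.get_area_list()
    print('found', len(areas), 'areas, e.g.', areas[:3])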