Example #1
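    # Scrapes a Devpost user profile: pulls the title and external links from the
    # 'portfolio-user-links' list (GitHub and LinkedIn links are scraped further),
    # plus the portfolio tags. util.getPage presumably returns a response or None.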
    def scrape_user(self,url):
        r = util.getPage(url)
        if not r:
            return None, None, None
        s = BeautifulSoup(r.text)
        links_html = s.find(id='portfolio-user-links')
        title = ''
        links = {}
        if links_html:
            for li in links_html.findAll('li'):
                if not li.findAll('a'):
                    title = util.removeSpaceNewLine(li.get_text())
                else:
                    if li.a:
                        nameOfSite = util.removeSpaceNewLine(li.get_text())
                        urlOfSite = li.a['href']
                        if nameOfSite.lower() == 'github' or urlOfSite.find('github') > -1:
                            info = self.scrape_github(urlOfSite)
                            links[nameOfSite] = info
                        #elif nameOfSite.lower() == 'twitter' or urlOfSite.find('twitter') > -1:
                        #    info = self.scrape_twitter(urlOfSite)
                        elif nameOfSite.lower() == 'linkedin' or urlOfSite.find('linkedin') > -1:
                            info = self.scrape_linked_in(urlOfSite)
                            links[nameOfSite] = info
                        #else:
                        #    info = self.scrape_arbitrary(urlOfSite)

        tags_html = [t for t in s.findAll('ul') if 'class' in t.attrs and 'portfolio-tags' in t['class']]
        tags = []
        if tags_html:
            tags_html = tags_html[0]
            for t in tags_html.findAll('li'):
                if t.a:
                    tags.append(t.a.string)
        return title, links, tags
Example #2
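    # Walks the paginated Devpost hackathon listing, printing each challenge URL and
    # its date range. Non-ASCII characters are stripped from the date string before
    # it is handed to parse(), presumably dateutil's date parser.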
    def scrape_hackathons(self):
        names = set()
        for page in xrange(1,3000):
            r = util.getPage('http://devpost.com/hackathons?page='+str(page))
            if not r:
                continue

            s = BeautifulSoup(r.text)
            for article in s.findAll('article'):
                if 'class' in article.attrs and 'challenge-listing' in article['class']:
                    href = article.a.get('href').split('.devpost')[0]
                    print href
                for sub in article.findAll('span'):
                    if 'class' in sub.attrs and 'value' in sub['class'] and 'date-range' in sub['class']:
                        datestring = u"{}".format(sub.text).encode('utf8')
                        formattedDatestring = []
                        forbidden = False
                        for c in datestring:
                            if ord(c) < 128 and not forbidden:
                                formattedDatestring.append(c)
                            elif ord(c) < 128:
                                if c == ',':
                                    forbidden = False
                            else:
                                forbidden = True

                        formattedDatestring = ''.join(formattedDatestring)
                        d = parse(formattedDatestring)
                        print d
Example #3
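 # Fetches an arbitrary URL and returns its visible text, or None if the page
 # could not be retrieved.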
 def scrape_arbitrary(self,url):
     r = util.getPage(url)
     if r:
         s = BeautifulSoup(r.text)
         return s.get_text()
     else:
         return None
Example #4
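 # Builds the list of per-review URLs for a campsite from its review index page;
 # O1 and L1 appear to be the offset and page-size query parameters.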
 def __get_review_urls(self, area, campsite_id, index, per_page):
     html = getPage("{}/{}/{}/review/?O1={}&L1={}&".format(
         self.SITE_URL, area, campsite_id, index, per_page))
     soup = BeautifulSoup(html, "html.parser")
     links = soup.select("p.review_sentence a.more_info")
     paths = [link.get("href") for link in links]
     return [self.SITE_URL + path for path in paths]
Example #5
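    # Scrapes a Devpost project page: the hackathon it was submitted to, each team
    # member (scraped in turn via scrape_user), and the project description text.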
    def scrape_project(self, url):
        r = util.getPage(url)
        if not r:
            return None,None,None
        s = BeautifulSoup(r.text)
        hackathon = ''
        submissions = s.find(id='submissions')
        if submissions:
            for sub in submissions.findAll('div'):
                if 'class' in sub.attrs and 'software-list-content' in sub['class']:
                    hackathon = sub.a['href']
        members = {}
        team = s.find(id='app-team')
        if team:
            for sub in team.findAll('li'):
                if 'class' in sub.attrs and 'software-team-member' in sub['class']:
                    for link in sub.findAll('a'):
                        if 'class' in link.attrs and 'user-profile-link' in link['class']:
                            if link.string:
                                memberUrl = link['href']
                                name = link.string
                                userInfo = self.scrape_user(memberUrl)
                                members[name] = userInfo
        parsedDetails = ''
        details = s.find(id='app-details-left')
        if details:
            for d in details.findAll('div'):
                if 'id' not in d.attrs or (d['id'] != 'built-with'):
                    parsedDetails += d.get_text()

        return hackathon, members, parsedDetails
Example #6
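 # Extracts the review body from a review page, preferring the structured
 # div.review_text entries and falling back to the first p.review_sentence.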
 def __get_review(self, url):
     html = getPage(url)
     soup = BeautifulSoup(html, "html.parser")
     dds = soup.select("div.review_text dl dd")
     if len(dds) > 0:
         review = ''.join(
             [d.text.replace('\r\n', '').replace('\n', '') for d in dds])
     else:
         sentences = soup.select("p.review_sentence")[0]
         review = sentences.text.strip().replace('\r', '').replace('\n', '')
     return review
Example #7
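    # Scrapes public GitHub profile stats (total contributions, followers, following,
    # starred) and counts repositories on the ?tab=repositories page. Relies on
    # GitHub's HTML class names, which may change over time.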
    def scrape_github(self,url):
        r = util.getPage(url)
        if not r:
            return None
        totalContributions = 0
        numFollowers = 0
        numFollowing = 0
        numStarred = 0
        s = BeautifulSoup(r.text)
        for div in s.findAll('div'):
            if 'class' in div.attrs and 'contrib-column-first' in div['class']:
                for sub in div.findAll('span'):
                    if 'class' in sub.attrs and 'contrib-number' in sub['class']:
                        minusTotal = sub.string.split(" total")[0]
                        minusComma = minusTotal.replace(' ','').replace(',','')
                        totalContributions = int(minusComma)
            if 'class' in div.attrs and 'vcard-stats' in div['class']:
                for a in div.findAll('a'):
                    if a['href'].find('followers') > -1:
                        numFollowers = int(a.strong.string.replace(' ','').replace(',',''))
                    elif a['href'].find('stars') > -1:
                        numStarred = int(a.strong.string.replace(' ','').replace(',',''))
                    elif a['href'].find('following') > -1:
                        numFollowing = int(a.strong.string.replace(' ','').replace(',',''))
        url += '?tab=repositories'
        r = util.getPage(url)
        numRepos = 0
        if r:
            s = BeautifulSoup(r.text)
            for h3 in s.findAll('h3'):
                if 'class' in h3.attrs and 'repo-list-name' in h3['class']:
                    numRepos += 1

        return {
                'totalContributions': totalContributions,
                'numFollowing': numFollowing,
                'numFollowers': numFollowers,
                'numStarred': numStarred,
                'numRepos': numRepos
                }
Example #8
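    # Collects area slugs from the prefecture search block on the site's top page;
    # each slug is the first path segment of the linked URL.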
    def get_area_list(self):
        area_list = []

        html = getPage("{}".format(self.SITE_URL))
        soup = BeautifulSoup(html, "html.parser")
        search_prefecture_elm = soup.find('div', id='main_search_prefecture')
        elms = search_prefecture_elm.find_all('dd')
        for elm in elms:
            links = elm.find_all('a')
            for link in links:
                area = link['href'].split('/')[1]
                area_list.append(area)
        # print(len(area_list))
        return area_list
Example #9
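    # Collects participant names from the paginated HackMIT participant listing on
    # Devpost and returns them as a set.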
    def scrape_names(self):
        names = set()
        for page in xrange(1,37):
            r = util.getPage('http://hackmit.devpost.com/participants?page='+str(page))
            if not r:
                continue

            s = BeautifulSoup(r.text)

            for li in s.findAll('li'):
                if 'class' in li.attrs and 'participant-name' in li['class']:
                    noSpaceName = li.text.replace(' ','').replace('\n','')
                    if noSpaceName:
                        name = li.text.replace('\n','')
                        names.add(name)
                        print name
        return names
Example #10
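    # Scrapes one page of an area's campsite listing, returning a dict per campsite
    # with its id (taken from the link URL), name, and area.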
    def __get_campsite_list_from_page(self, area, start_no, per_page):
        html = getPage(
            "{}/{}/list?OFFSET={}&LIMIT={}&display_order=21&".format(
                self.SITE_URL, area, start_no, per_page))
        soup = BeautifulSoup(html, "html.parser")
        campsite_elms = soup.select("div.block_campsite_list div.camp_list a")

        campsite_list = []
        for elm in campsite_elms:
            campsite_id = elm.get("href").split('/')[-2]
            campsite_name = elm.select("h2 span.name")[0].string
            campsite_name = campsite_name.replace('\u3000', '')
            campsite_list.append({
                'id': campsite_id,
                'name': campsite_name,
                'area': area
            })
        return campsite_list
Example #11
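    # Maps each available date (formatted with DATE_FORMAT) to its sign-up link,
    # identified by hrefs that start with "mpSignUp.asp".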
    def get_available_dates(self, dates_url):
        results = {}

        try:
            html = util.getPage(dates_url)
            soup = BeautifulSoup(html, "html.parser")

            for link in soup.find_all("a"):
                href = link.get("href")

                if href is not None:
                    if href.startswith("mpSignUp.asp"):
                        date = link.get("title")
                        parsed_date = dateutil.parser.parse(date)
                        format_date = parsed_date.strftime(DATE_FORMAT)
                        results[format_date] = href

        except Exception:
            print "exception"
            return results

        return results
Example #12
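    # Iterates over Devpost's JSON software-search pages, enriches each project via
    # scrape_project, drops fields that are not needed, and saves the result to the
    # pickle file pf.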
    def scrape_projects(self, startPage=1, endPage=1221, pf='projects.p'):

        util.checkPickleFileExistsAndCreate(pf)
        names = set()
        projects = []
        for page in xrange(startPage,endPage+1):
            print "working on page:", page
            r = util.getPage('http://devpost.com/software/search?page='+str(page))
            if not r:
                continue

            projects_dict = json.loads(r.text)
            for p in projects_dict['software']:
                hackathon, members, details = self.scrape_project(p['url'])
                p['hackathon'] = hackathon
                p['members'] = members
                p['details'] = details
                del p['photo']
                del p['slug']
                del p['url']
                projects.append(p)
        util.saveObjectsToPickleFile({'projects':projects},pf)
        return projects
Example #13
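 # Stub: fetches the Twitter profile page but does not parse it yet.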
 def scrape_twitter(self,url):
     r = util.getPage(url)
     return None
Example #14
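 # Reads the total review count for a campsite from the span[itemprop='votes']
 # element on its review page.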
 def __get_total_review_count(self, area, campsite_id):
     html = getPage("{}/{}/{}/review".format(self.SITE_URL, area,
                                             campsite_id))
     soup = BeautifulSoup(html, "html.parser")
     return int(
         soup.select("div.review_num span[itemprop='votes']")[0].string)