Example #1
def getDetails(pLinks: list):
    """Get all the information for specific porject
        :pLink: List of all the Organization on one page
    """
    project = []
    for pLink in pLinks:
        Link = URL + pLink
        soup = getPage(Link)
        title = soup.find('h3', {'class': 'banner__title'}).text
        orgData = soup.find('main', {
            'class': 'app-body'
        }).find('div', {'class': 'org__meta'})
        try:
            org = orgData.find_all('div')[1].find('div').text
            studentName = orgData.find_all('div')[3].find('div').text
            mentors = orgData.find_all('div')[5].find('ul').find_all('li')
            mentors = [mentor.text for mentor in mentors]
            project.append({
                "Organization": org,
                "title": title,
                "student": studentName,
                "mentors": mentors,
                "link": pLink
            })
        except AttributeError:
            print(title)

    return project
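Every example here calls a getPage helper that is never shown. In most of them the return value is used as a parsed tree, so here is a minimal sketch, assuming it fetches the URL with requests and returns a BeautifulSoup object (the timeout and error handling are assumptions, not the original implementation):

import requests
from bs4 import BeautifulSoup


def getPage(url):
    """Fetch a URL and return its parsed HTML tree (assumed helper)."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')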
Example #2
def procesarPeriodos():
    """Fetch the index of survey periods and process each linked period."""
    url = 'http://encuestas_finales.exactas.uba.ar/periodos.html'
    page = getPage(url)
    for tr in page.select_one(".content").select("tr"):
        td = tr.select('td')[1]
        href = td.a.attrs["href"]
        processCuat(base + href)
Example #3
def main():
    orgs_data = {}
    projects_data = {}
    for year in range(2005, 2009):
        url = developer + '/open-source/gsoc/{yr}/'.format(yr=year)
        soup = getPage(url)
        orgs, projects = get_info(soup)
        orgs_data[year] = orgs
        projects_data[year] = projects
    dumper(orgs_data, "orgs_2005-2008.json")
    dumper(projects_data, "projects_2005-2008.json")
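The dumper helper used above is likewise undefined in these snippets. A plausible sketch, assuming it simply serializes the collected data to a JSON file:

import json


def dumper(data, filename):
    """Write the scraped data to a JSON file (assumed helper)."""
    with open(filename, 'w') as fp:
        json.dump(data, fp, indent=4)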
Example #4
def main():
    projects = []
    for year in range(2016, 2018):
        for page in range(1, 12):
            url = URL + '/archive/{yr}/projects/?page={page}'.format(yr=year,
                                                                     page=page)
            soup = getPage(url)
            projectLinks = getList(soup)[1:-1]
            pDetails = getDetails(projectLinks)
            projects.extend(pDetails)
    dumper(projects, 'projects_2016-2017.json')
    dumper(orgs_info(), 'organizations_2016-2017.json')
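getList is another helper that is not shown; the [1:-1] slice in Example #4 suggests it returns every link on the archive page, with the first and last (navigation) entries dropped by the caller. A hypothetical sketch along those lines, not the original implementation:

def getList(soup):
    """Collect the href of every anchor on an archive page (assumed helper)."""
    return [a.get('href') for a in soup.find_all('a') if a.get('href')]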
Example #5
def get_project_info(project_urls):
    """Get detail information of projects from given links
        :project_urls: list of all the project urls
    """
    project_info = []
    for url in project_urls:
        soup = getPage(url)
        about = soup.find_all("p")
        title = soup.find("h3").text
        student = about[0].text.splitlines()[2].strip()
        details = about[1].text
        name = about[0].find("a").text
        project_info.append({'Organization': name, 'title': title,
                             'student': student, 'details': details,
                             'link': url})

    return project_info
Example #6
def orgs_info():
    """Get name and links of org fro 2016-2018"""

    all_org = []
    for year in range(2016, 2018):
        orgs_url = join(URL, "archive/{yr}/organizations/".format(yr=year))
        soup = getPage(orgs_url)
        orgs = soup.findAll('li',
                            attrs={'class': 'organization-card__container'})

        for org in orgs:
            name = org.find('h4').text
            link = org.find('a').get('href')
            about = org.find(
                'div', {
                    'class': "organization-card__tagline font-black-54"
                }).text
            all_org.append({'link': URL + link, 'name': name, 'about': about})

    return all_org
Example #7
	def crawl_daili66(self, page_count=10):
		"""
		爬取66代理数据
		:return: host:port
		"""
		# 66代理网站分页数据,n.html 即可.页码从1开始
		base_url = 'http://www.66ip.cn/{}.html'
		urls = [base_url.format(page) for page in range(2, page_count+1)]
		for url in urls:
			print('Crawling:', url)
			html = getPage(url)
			if html:
				print('Parsing page')
				# Parse the page source with XPath
				html = etree.HTML(html)
				tr_list = html.xpath('//div[@id="main"]//div[@align="center"]/table//tr')
				
				for tr in tr_list[1:]:
					host = tr.xpath('./td[1]/text()')[0]
					port = tr.xpath('./td[2]/text()')[0]
					yield ':'.join([host, port])
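Unlike the BeautifulSoup-based examples, this crawler passes the result of getPage straight to lxml's etree.HTML (so the enclosing module presumably does from lxml import etree), which means getPage here evidently returns raw HTML text. A text-returning variant, again an assumption rather than the original helper:

import requests


def getPage(url):
    """Fetch a URL and return the raw HTML text (assumed helper)."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text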
Example #8
def All_orgs():
    """Get links of all orgs from 2009 to 2015

        Makes two separate lists:
        links_13 - links of all the organizations from 2009-2013
        links_14 - links of all the organizations from 2014 and 2015
    """

    links_13 = []
    links_14 = []
    valid_url = "/?archive/?gsoc/\d+[0-9]/orgs/[a-zA-Z]+"
    for year in range(2009, 2016):
        year_url = melange + "/archive/gsoc/{}".format(year)
        soup = getPage(year_url)

        for url in soup.find_all('a'):
            if re.match(valid_url, url.get("href")):
                if year <= 2013:
                    links_13.append(join(melange, url.get("href")[1:]))
                else:
                    links_14.append(join(melange, url.get("href")[1:]))
    return links_13, links_14
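Examples #9 and #10 below also rely on a grab_project_links helper that is not defined here. A hypothetical sketch, assuming project pages can be recognized by a path fragment in their href (the marker string and the melange base URL are guesses):

from os.path import join

# Assumed base URL of the melange archive used throughout these examples
melange = 'https://www.google-melange.com'


def grab_project_links(soup):
    """Collect links to an org's project pages (hypothetical helper)."""
    links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href', '')
        if 'project' in href:  # assumed marker for project pages
            links.append(join(melange, href.lstrip('/')))
    return links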
Example #9
def org_info_below_13(org_urls13):
    """Scrape information about the orgs from 2009-2013
        :org_urls13: list of urls for all the orgs
    """
    org_info_till13 = []
    project_urls_till13 = []
    for url in org_urls13:
        # General information about the org
        try:
            soup = getPage(url)
            org_name = basename(url)
            org_info = soup.find_all('p')
            web_page = org_info[0].text.splitlines()[-1].strip()
            mailing_list = org_info[1].text.split(":")[-1].strip()
            detail = org_info[2].text
            org_info_till13.append({'name': org_name, 'about': detail,
                                    'page': web_page, 'mail': mailing_list,
                                    'link': url})
            project_urls_till13.extend(grab_project_links(soup))

        except IndexError:
            print(url)

    return org_info_till13, get_project_info(project_urls_till13)
Example #10
def org_info_above_14(orgs_urls14):
    """Scarpe information about orgs of year 2014 and 2015
        :orgs_urls14: list of urls of year 2014 and 2015
    """
    org_info_14 = []
    project_urls_from14 = []
    for url in orgs_urls14:
        try:
            soup = getPage(url)
            org_name = basename(url)
            org_info = soup.find_all('p')
            web_page = org_info[1].text.splitlines()[-1].strip()
            mailing_list = org_info[2].text.split(":")[-1].strip()
            description = soup.find(
                'div', {'class': 'main mdl-cell mdl-cell--8-col '
                                 'mdl-card mdl-shadow--4dp'})
            detail = description.find_all('p')[2].nextSibling
            org_info_14.append({'name': org_name, 'page': web_page,
                                'about': detail, 'mail': mailing_list,
                                'link': url})
            project_urls_from14.extend(grab_project_links(soup))
        except IndexError:
            print(url)

    return org_info_14, get_project_info(project_urls_from14)
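A hypothetical driver tying the melange-era scrapers together; the function name and output filenames are illustrative, and dumper is the assumed JSON helper sketched earlier:

def scrape_melange():
    """Scrape orgs and projects for 2009-2015 and dump them to JSON (illustrative)."""
    links_13, links_14 = All_orgs()
    orgs_13, projects_13 = org_info_below_13(links_13)
    orgs_14, projects_14 = org_info_above_14(links_14)
    dumper(orgs_13 + orgs_14, 'organizations_2009-2015.json')
    dumper(projects_13 + projects_14, 'projects_2009-2015.json')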