def getDetails(pLinks: list):
    """Collect the details of every project reachable from the given links.

    :pLinks: list of project link paths scraped from one listing page
    """
    results = []
    for path in pLinks:
        soup = getPage(URL + path)
        title = soup.find('h3', {'class': 'banner__title'}).text
        meta = soup.find('main', {'class': 'app-body'}).find(
            'div', {'class': 'org__meta'})
        try:
            # The meta panel is a flat run of <div>s at fixed positions;
            # grab them once instead of re-querying per field.
            rows = meta.find_all('div')
            results.append({
                "Organization": rows[1].find('div').text,
                "title": title,
                "student": rows[3].find('div').text,
                "mentors": [m.text for m
                            in rows[5].find('ul').find_all('li')],
                "link": path,
            })
        except AttributeError:
            # Page didn't have the expected layout — log the title so
            # the gap is visible, then keep going.
            print(title)
    return results
def procesarPeriodos():
    """Walk the exam-period index page and process every term it links to."""
    url = 'http://encuestas_finales.exactas.uba.ar/periodos.html'
    page = getPage(url)
    rows = page.select_one(".content").select("tr")
    for row in rows:
        # The link to the term lives in the second cell of each row.
        cell = row.select('td')[1]
        processCuat(base + cell.a.attrs["href"])
def main():
    """Scrape GSoC org and project data for 2005-2008 and dump each to JSON."""
    orgs_data = {}
    projects_data = {}
    for year in range(2005, 2009):
        page_url = developer + '/open-source/gsoc/{yr}/'.format(yr=year)
        soup = getPage(page_url)
        orgs_data[year], projects_data[year] = get_info(soup)
    dumper(orgs_data, "orgs_2005-2008.json")
    dumper(projects_data, "projects_2005-2008.json")
def main():
    """Scrape GSoC 2016-2017 project and organization data and dump to JSON.

    Iterates listing pages 1-11 for each year, follows every project link
    for its details, then writes both data sets out.
    """
    projects = []
    for year in range(2016, 2018):
        for page in range(1, 12):
            url = URL + '/archive/{yr}/projects/?page={page}'.format(
                yr=year, page=page)
            soup = getPage(url)
            # Drops the first and last list entries — presumably
            # non-project navigation links; TODO confirm against getList.
            projectLinks = getList(soup)[1:-1]
            projects.extend(getDetails(projectLinks))
    # Bug fix: the file name was missing the '.json' extension, unlike
    # every other dumper() call in this file.
    dumper(projects, 'projects_2016-2017.json')
    dumper(orgs_info(), 'organizations_2016-2017.json')
def get_project_info(project_urls):
    """Fetch each project page and extract its detail fields.

    :project_urls: list of all the project urls
    """
    info = []
    for link in project_urls:
        page = getPage(link)
        paragraphs = page.find_all("p")
        lead = paragraphs[0]
        info.append({
            # First paragraph holds the org link and the student line.
            'Organization': lead.find("a").text,
            'title': page.find("h3").text,
            'student': lead.text.splitlines()[2].strip(),
            'details': paragraphs[1].text,
            'link': link,
        })
    return info
def orgs_info():
    """Get name, link, and tagline of every org for 2016-2017.

    Returns a list of dicts with 'link', 'name', and 'about' keys.
    (Docstring fix: the range covers 2016 and 2017 only, not 2018.)
    """
    all_org = []
    for year in range(2016, 2018):
        orgs_url = join(URL, "archive/{yr}/organizations/".format(yr=year))
        soup = getPage(orgs_url)
        # find_all: modern BeautifulSoup spelling, consistent with the
        # rest of this file (findAll is the deprecated alias).
        orgs = soup.find_all('li',
                             attrs={'class': 'organization-card__container'})
        for org in orgs:
            name = org.find('h4').text
            link = org.find('a').get('href')
            about = org.find(
                'div',
                {'class': "organization-card__tagline font-black-54"}).text
            all_org.append({'link': URL + link, 'name': name, 'about': about})
    return all_org
def crawl_daili66(self, page_count=10):
    """Crawl proxy entries from the 66ip.cn free-proxy site.

    :param page_count: number of listing pages to crawl
    :yield: proxies as 'host:port' strings
    """
    # Listing pages are addressed as /<n>.html, numbered from 1.
    base_url = 'http://www.66ip.cn/{}.html'
    # Bug fix: the range previously started at 2, silently skipping page 1
    # even though the site's numbering (per the original comment) starts at 1.
    urls = [base_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('正在爬取:', url)
        html = getPage(url)
        if html:
            print('开始解析')
            # Parse the raw page source and pull rows out with XPath.
            doc = etree.HTML(html)
            rows = doc.xpath(
                '//div[@id="main"]//div[@align="center"]/table//tr')
            # rows[0] is the table header — skip it.
            for row in rows[1:]:
                host = row.xpath('./td[1]/text()')[0]
                port = row.xpath('./td[2]/text()')[0]
                yield ':'.join([host, port])
def All_orgs():
    """Get links of all orgs from 2009 to 2015.

    Makes two separate lists:
        links_13 - links of all the Organizations from 2009-2013
        links_14 - links of all the Organizations from 2014 and 2015
    """
    links_13 = []
    links_14 = []
    # Raw string so \d is explicit regex syntax, not a string escape.
    valid_url = r"/?archive/?gsoc/\d+[0-9]/orgs/[a-zA-Z]+"
    for year in range(2009, 2016):
        year_url = melange + "/archive/gsoc/{}".format(year)
        soup = getPage(year_url)
        for anchor in soup.find_all('a'):
            href = anchor.get("href")
            # Robustness fix: anchors without an href return None, which
            # previously crashed re.match with a TypeError.
            if href and re.match(valid_url, href):
                target = links_13 if year <= 2013 else links_14
                target.append(join(melange, href[1:]))
    return links_13, links_14
def org_info_below_13(org_urls13):
    """Scrape information about the orgs from 2009-2013.

    :org_urls13: list of urls for all the orgs
    """
    collected = []
    project_links = []
    for org_url in org_urls13:
        # General information about the org
        try:
            page = getPage(org_url)
            paragraphs = page.find_all('p')
            collected.append({
                'name': basename(org_url),
                'about': paragraphs[2].text,
                'page': paragraphs[0].text.splitlines()[-1].strip(),
                'mail': paragraphs[1].text.split(":")[-1].strip(),
                'link': org_url,
            })
            project_links.extend(grab_project_links(page))
        except IndexError:
            # Org page lacked the expected paragraph layout — log and skip.
            print(org_url)
    return collected, get_project_info(project_links)
def org_info_above_14(orgs_urls14):
    """Scrape information about orgs of year 2014 and 2015.

    :orgs_urls14: list of urls of year 2014 and 2015
    """
    org_info_14 = []
    project_urls_from14 = []
    for url in orgs_urls14:
        try:
            soup = getPage(url)
            org_name = basename(url)
            org_info = soup.find_all('p')
            web_page = org_info[1].text.splitlines()[-1].strip()
            mailing_list = org_info[2].text.split(":")[-1].strip()
            # Bug fix: the class string previously contained a stray
            # backslash (a line continuation swallowed into the literal),
            # so it could never match the page's real class attribute.
            description = soup.find(
                'div',
                {'class': 'main mdl-cell mdl-cell--8-col '
                          'mdl-card mdl-shadow--4dp'})
            detail = description.find_all('p')[2].nextSibling
            org_info_14.append({'name': org_name, 'page': web_page,
                                'about': detail, 'mail': mailing_list,
                                'link': url})
            project_urls_from14.extend(grab_project_links(soup))
        except (IndexError, AttributeError):
            # AttributeError covers description being None on pages with a
            # different layout; log the url and continue, as elsewhere.
            print(url)
    return org_info_14, get_project_info(project_urls_from14)