def getProjects(self):
    """Scrape every project listed on the site, including its releases, issues (with comments), and wiki pages."""
    # Perhaps I should provide more API endpoints to make scraping easier...
    projectlist = self.curl_get(self.DOMAIN + "/projects/").getvalue()
    soup = BeautifulSoup(projectlist, "html.parser")
    links = soup.find("ul", "prjlistclass")
    projects = []
    for link in links.find_all("a"):
        project = Project()
        sourceType = None
        projectURL = self.DOMAIN + link.get("href")
        projectName = projectURL.split("/")[-2]
        projectpageHTML = self.curl_get(projectURL).getvalue()
        projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
        sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + sourceURL).getvalue(), "html.parser")
        sourceSoupText = sourceSoup.get_text()
        # get source
        if "git clone" in sourceSoupText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "git://beta.datanethost.net/" + projectName + ".git"
        elif "svn co" in sourceSoupText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = self.DOMAIN + "/svn/" + projectName + "/"
        else:
            project.repoType = REPO_TYPES.hg
            project.repoURL = self.DOMAIN + "/hg/" + projectName + "/"
        # get downloads
        project.releases = []
        downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/").getvalue(), "html.parser")
        downloadSection = downloadsSoup.find("table", "uploads")
        if "No downloads were found." not in downloadsSoup.get_text():
            downloadRows = downloadSection.find_all("tr")[1:]
            for downloadRow in downloadRows:
                cols = downloadRow.find_all("td")
                downloadTD = cols[0]
                downloadURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + downloadTD.a.text
                fileName = downloadTD.a.text
                release = Release()
                release.fileURL = downloadURL
                release.fileName = fileName
                project.releases.append(release)
        # get issues
        project.issues = []
        issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/").getvalue(), "html.parser")
        if "No issues were found." not in issuesSoup.get_text():
            issuesSection = issuesSoup.find("table", "recent-issues")
            for issueRow in issuesSection.find_all("tr")[1:]:
                issue = Issue()
                cols = issueRow.find_all("td")
                issueId = cols[0].text
                issueURL = projectURL + "issues/" + issueId + "/"
                issueStatus = cols[2].text
                issueSummary = cols[1].text
                issueTitle = cols[1].find("a").text
                issueAuthor = cols[3].text
                issue.author = issueAuthor
                issue.comments = []
                issue.status = issueStatus
                issue.summary = issueSummary
                issue.title = issueTitle
                issue.id = issueId
                # we must go deeper to get comments
                issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
                for comment in issueComments.find_all("div", "issue-comment"):
                    author = comment.find("p").get_text().split("by")[1].split(",")[0]
                    date = comment.find("span").get_text()
                    commentText = comment.find("pre").get_text()
                    issueComment = IssueComment()
                    issueComment.date = date
                    issueComment.author = author
                    issueComment.summary = commentText
                    issue.comments.append(issueComment)
                project.issues.append(issue)
        # get wiki pages
        project.wikis = []
        wikiSoup = BeautifulSoup(self.curl_get(projectURL + "doc/").getvalue(), "html.parser")
        if "No documentation pages were found." not in wikiSoup.get_text():
            wikiSection = wikiSoup.find("table", "recent-issues")
            for wikiRow in wikiSection.find_all("tr")[1:]:
                wiki = Wiki()
                cols = wikiRow.find_all("td")
                wiki.pageName = cols[0].text
                wiki.summary = cols[1].text
                wiki.updated = cols[2].text
                wikiURL = projectURL + "page/" + wiki.pageName + "/"
                wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
                wikiContent = wikiPageSoup.find(id="wiki-content")
                wiki.htmlContent = wikiContent.prettify()
                wiki.textContent = wikiContent.get_text()
                project.wikis.append(wiki)
        projects.append(project)
    return projects
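# Note: getProjects() walks every project page and then fetches its downloads,
# issues (plus each issue's comments), and wiki pages one request at a time,
# so it can take a while on instances hosting many projects.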
def getProject(self, projectName):
    """Scrape a single project by name, including its releases, issues (with comments), and wiki pages."""
    project = Project()
    sourceType = None
    projectURL = self.DOMAIN + "/p/" + projectName + "/"
    projectpageHTML = self.curl_get(projectURL).getvalue()
    projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
    sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
    sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + "/p/" + sourceURL).getvalue(), "html.parser")
    sourceSoupText = sourceSoup.get_text()
    # get source
    if "git clone" in sourceSoupText:
        project.repoType = REPO_TYPES.git
        project.repoURL = "https://code.google.com/p/" + projectName + "/"
    elif "svn co" in sourceSoupText:
        project.repoType = REPO_TYPES.SVN
        project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
    else:
        project.repoType = REPO_TYPES.hg
        project.repoURL = "https://code.google.com/p/" + projectName + "/"
    # get downloads
    project.releases = []
    downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/list").getvalue(), "html.parser")
    downloadSection = downloadsSoup.find("table", "results")
    if "Your search did not generate any results." not in downloadsSoup.get_text():
        downloadRows = downloadSection.find_all("tr")[1:]
        for downloadRow in downloadRows:
            cols = downloadRow.find_all("td")
            downloadTD = cols[1]
            downloadURL = "https://" + projectName + ".googlecode.com/files/" + downloadTD.a.text.replace("\n", "").strip(" ")
            fileName = downloadTD.a.text.replace("\n", "").strip(" ")
            release = Release()
            release.fileURL = downloadURL
            release.fileName = fileName
            project.releases.append(release)
    # get issues
    project.issues = []
    issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/list").getvalue(), "html.parser")
    if "Your search did not generate any results." not in issuesSoup.get_text():
        issuesSection = issuesSoup.find("table", "results")
        for issueRow in issuesSection.find_all("tr")[1:]:
            issue = Issue()
            cols = issueRow.find_all("td")
            issueId = cols[1].text.replace("\n", "").strip()
            issueURL = projectURL + "issues/detail?id=" + issueId
            issueStatus = cols[3].text.replace("\n", "").strip(" ")
            issueSummary = cols[8].text.replace("\n", "")
            issueTitle = cols[8].text.replace("\n", "")
            issueAuthor = cols[5].text.replace("\n", "")
            #issue.author = issueAuthor
            issue.comments = []
            issue.status = issueStatus.strip(" ")
            issue.summary = issueSummary.strip(" ")
            issue.title = issueTitle
            issue.id = issueId
            # we must go deeper to get comments
            issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
            for comment in issueComments.find_all("div", "vt"):
                #author = comment.find(class_="author").find("a").text
                author = (comment.find(class_="author").find_all("a")[-1]).contents
                date = comment.find("span", "date")["title"]
                commentText = comment.find("pre").get_text()
                issueComment = IssueComment()
                issueComment.date = date
                issueComment.author = author
                issueComment.summary = commentText
                issue.comments.append(issueComment)
            project.issues.append(issue)
    # get wiki pages
    project.wikis = []
    wikiSoup = BeautifulSoup(self.curl_get(projectURL + "w/list").getvalue(), "html.parser")
    if "Your search did not generate any results." not in wikiSoup.get_text():
        wikiSection = wikiSoup.find("table", "results")
        for wikiRow in wikiSection.find_all("tr")[1:]:
            wiki = Wiki()
            cols = wikiRow.find_all("td")
            wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
            wiki.summary = cols[2].text.replace("\n", "").strip(" ")
            wiki.updated = cols[3].text.replace("\n", "").strip(" ")
            wikiURL = projectURL + "wiki/" + wiki.pageName
            wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
            wikiContent = wikiPageSoup.find(id="wikicontent")
            wiki.htmlContent = wikiContent.prettify()
            wiki.textContent = wikiContent.get_text()
            project.wikis.append(wiki)
    return project
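# Minimal usage sketch. The class name "Scraper" and the project name
# "exampleproject" below are hypothetical placeholders; the real enclosing
# class, its constructor, DOMAIN, and curl_get are defined elsewhere in this
# module. getProject/getProjects are the methods above.
#
#   scraper = Scraper()
#   project = scraper.getProject("exampleproject")
#   for release in project.releases:
#       print(release.fileName, release.fileURL)
#   for issue in project.issues:
#       print(issue.id, issue.status, issue.title)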