def gradeWiki(partId, ch_aux):
    import sys
    original_stdout = sys.stdout
    sys.stdout = NullDevice()
    wiki = Wiki()
    if partId == 1:
        wivesFile = '../data/wives.txt'
        goldFile = '../data/gold.txt'
        wives = wiki.addWives(wivesFile)
    else:
        wives = ch_aux.split('\n')
        wives = wives[1:]
    wikiFile = '../data/small-wiki.xml'
    infoBoxHusbands = wiki.processFile(open(wikiFile), wives, True)
    modifyWiki(ch_aux[0], wikiFile)
    mod_file = open('wiki_mod')
    noInfoHusbands = wiki.processFile(mod_file, wives, False)
    #os.remove('wiki_mod')
    sys.stdout = original_stdout
    if partId == 1:
        infoScore = evaluateAnswers(infoBoxHusbands, goldFile)
        noInfoScore = evaluateAnswers(noInfoHusbands, goldFile)
        return infoScore + noInfoScore
    else:
        return encodeWiki(infoBoxHusbands, noInfoHusbands)
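# gradeWiki() swaps sys.stdout for a NullDevice to silence the grader's print
# output, but NullDevice is not defined in this snippet. A minimal sketch,
# assuming it is simply a write-and-discard sink:
class NullDevice:
    def write(self, s):
        # Discard everything written to the fake stdout.
        pass

    def flush(self):
        # Nothing is buffered, so there is nothing to flush.
        pass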
def __processEpisodes(self):
    import re
    from Wiki import Wiki, WikiLookupError

    seasonData = self.__seasonData
    deeplink = self.__getSeasonDeeplink()
    if deeplink is not None:
        wiki = Wiki()
        seasonData = wiki.query(deeplink)
        self.__seasonData = seasonData

    # finditer() never returns None, so collect the match offsets and test
    # for emptiness instead of comparing the iterator against None.
    episodeRegex = re.compile('{{Episode list', re.IGNORECASE)
    starts = [match.start() for match in episodeRegex.finditer(seasonData)]
    if not starts:
        raise WikiLookupError("Could not find '{{Episode list' tag")

    # Each episode block runs from one '{{Episode list' tag to the next;
    # the final block runs to the end of the season data.
    ends = starts[1:] + [len(self.__seasonData) - 1]
    for episodeStart, episodeEnd in zip(starts, ends):
        self.__processEpisodeData(episodeStart, episodeEnd)
def start(self):
    # Map each collection name to its crawler class and console label.
    crawlers = {
        "api_busan": (Busan, "부산"),
        "api_herald": (Herald, "헤럴드"),
        "api_nocut": (Nocut, "노컷"),
        "api_ohmynews": (Ohmynews, "오마이"),
        "api_wikitree": (Wiki, "위키"),
        "api_donga": (Donga, "동아"),
        "api_hangook": (Hangook, "한국"),
        "api_joseon": (Joseon, "조선"),
        "api_yeonhap": (Yeonhap, "연합"),
        "api_joongang": (Joongang, "중앙"),
    }
    while True:
        for col in self.colList:
            if col not in crawlers:
                # Skip unknown collections instead of reusing a stale crawler.
                continue
            self.articles.clear()
            crawlerClass, label = crawlers[col]
            subClass = crawlerClass()
            print("\n\n" + label + "\n\n")
            self.articles = subClass.crawling()
            subClass.quit()
            self.col = self.db[col]
            self.insertDB()
def getProjects(self):
    # Perhaps I should provide more API endpoints to make scraping easier...
    projectlist = self.curl_get(self.DOMAIN + "/projects/").getvalue()
    soup = BeautifulSoup(projectlist, "html.parser")
    links = soup.find("ul", "prjlistclass")
    projects = []
    for link in links.find_all("a"):
        project = Project()
        projectURL = self.DOMAIN + link.get("href")
        projectName = projectURL.split("/")[-2]
        projectpageHTML = self.curl_get(projectURL).getvalue()
        projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
        sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + sourceURL).getvalue(), "html.parser")
        sourceSoupText = sourceSoup.get_text()

        # get source
        if "git clone" in sourceSoupText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "git://beta.datanethost.net/" + projectName + ".git"
        elif "svn co" in sourceSoupText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = self.DOMAIN + "/svn/" + projectName + "/"
        else:
            project.repoType = REPO_TYPES.hg
            project.repoURL = self.DOMAIN + "/hg/" + projectName + "/"

        # get downloads
        project.releases = []
        downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/").getvalue(), "html.parser")
        downloadSection = downloadsSoup.find("table", "uploads")
        if "No downloads were found." not in downloadsSoup.get_text():
            downloadRows = downloadSection.find_all("tr")[1:]
            for downloadRow in downloadRows:
                cols = downloadRow.find_all("td")
                downloadTD = cols[0]
                downloadURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + downloadTD.a.text
                fileName = downloadTD.a.text
                release = Release()
                release.fileURL = downloadURL
                release.fileName = fileName
                project.releases.append(release)

        # get issues
        project.issues = []
        issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/").getvalue(), "html.parser")
        if "No issues were found." not in issuesSoup.get_text():
            issuesSection = issuesSoup.find("table", "recent-issues")
            for issueRow in issuesSection.find_all("tr")[1:]:
                issue = Issue()
                cols = issueRow.find_all("td")
                issueId = cols[0].text
                issueURL = projectURL + "issues/" + issueId + "/"
                issue.author = cols[3].text
                issue.comments = []
                issue.status = cols[2].text
                issue.summary = cols[1].text
                issue.title = cols[1].find("a").text
                issue.id = issueId
                # we must go deeper to get comments
                issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
                for comment in issueComments.find_all("div", "issue-comment"):
                    author = comment.find("p").get_text().split("by")[1].split(",")[0]
                    date = comment.find("span").get_text()
                    commentText = comment.find("pre").get_text()
                    issueComment = IssueComment()
                    issueComment.date = date
                    issueComment.author = author
                    issueComment.summary = commentText
                    issue.comments.append(issueComment)
                project.issues.append(issue)

        # get wiki pages
        project.wikis = []
        wikiSoup = BeautifulSoup(self.curl_get(projectURL + "doc/").getvalue(), "html.parser")
        if "No documentation pages were found." not in wikiSoup.get_text():
            wikiSection = wikiSoup.find("table", "recent-issues")
            for wikiRow in wikiSection.find_all("tr")[1:]:
                wiki = Wiki()
                cols = wikiRow.find_all("td")
                wiki.pageName = cols[0].text
                wiki.summary = cols[1].text
                wiki.updated = cols[2].text
                wikiURL = projectURL + "page/" + wiki.pageName + "/"
                wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
                wikiContent = wikiPageSoup.find(id="wiki-content")
                wiki.htmlContent = wikiContent.prettify()
                wiki.textContent = wikiContent.get_text()
                project.wikis.append(wiki)
        projects.append(project)
    return projects
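# Both scrapers call self.curl_get(url).getvalue(), so curl_get presumably
# returns a file-like buffer holding the response body. A minimal sketch of
# such a method using pycurl and io.BytesIO -- an assumption, since the real
# helper is not shown in these snippets:
import io
import pycurl

def curl_get(self, url):
    # Fetch url and return a BytesIO; .getvalue() then yields the raw bytes,
    # which BeautifulSoup accepts directly.
    buf = io.BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEDATA, buf)
    curl.setopt(pycurl.FOLLOWLOCATION, True)
    curl.perform()
    curl.close()
    return buf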
def getProject(self, projectName):
    project = Project()
    projectURL = self.DOMAIN + "/p/" + projectName + "/"
    projectpageHTML = self.curl_get(projectURL).getvalue()
    projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
    sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
    sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + "/p/" + sourceURL).getvalue(), "html.parser")
    sourceSoupText = sourceSoup.get_text()

    # get source
    if "git clone" in sourceSoupText:
        project.repoType = REPO_TYPES.git
        project.repoURL = "https://code.google.com/p/" + projectName + "/"
    elif "svn co" in sourceSoupText:
        project.repoType = REPO_TYPES.SVN
        project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
    else:
        project.repoType = REPO_TYPES.hg
        project.repoURL = "https://code.google.com/p/" + projectName + "/"

    # get downloads
    project.releases = []
    downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/list").getvalue(), "html.parser")
    downloadSection = downloadsSoup.find("table", "results")
    if "Your search did not generate any results." not in downloadsSoup.get_text():
        downloadRows = downloadSection.find_all("tr")[1:]
        for downloadRow in downloadRows:
            cols = downloadRow.find_all("td")
            downloadTD = cols[1]
            fileName = downloadTD.a.text.replace("\n", "").strip(" ")
            downloadURL = "https://" + projectName + ".googlecode.com/files/" + fileName
            release = Release()
            release.fileURL = downloadURL
            release.fileName = fileName
            project.releases.append(release)

    # get issues
    project.issues = []
    issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/list").getvalue(), "html.parser")
    if "Your search did not generate any results." not in issuesSoup.get_text():
        issuesSection = issuesSoup.find("table", "results")
        for issueRow in issuesSection.find_all("tr")[1:]:
            issue = Issue()
            cols = issueRow.find_all("td")
            issueId = cols[1].text.replace("\n", "").strip()
            issueURL = projectURL + "issues/detail?id=" + issueId
            issueStatus = cols[3].text.replace("\n", "").strip(" ")
            issueSummary = cols[8].text.replace("\n", "")
            issueTitle = cols[8].text.replace("\n", "")
            issueAuthor = cols[5].text.replace("\n", "")
            #issue.author = issueAuthor
            issue.comments = []
            issue.status = issueStatus.strip(" ")
            issue.summary = issueSummary.strip(" ")
            issue.title = issueTitle
            issue.id = issueId
            # we must go deeper to get comments
            issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
            for comment in issueComments.find_all("div", "vt"):
                #author = comment.find(class_="author").find("a").text
                author = (comment.find(class_="author").find_all("a")[-1]).contents
                date = comment.find("span", "date")["title"]
                commentText = comment.find("pre").get_text()
                issueComment = IssueComment()
                issueComment.date = date
                issueComment.author = author
                issueComment.summary = commentText
                issue.comments.append(issueComment)
            project.issues.append(issue)

    # get wiki pages
    project.wikis = []
    wikiSoup = BeautifulSoup(self.curl_get(projectURL + "w/list").getvalue(), "html.parser")
    if "Your search did not generate any results." not in wikiSoup.get_text():
        wikiSection = wikiSoup.find("table", "results")
        for wikiRow in wikiSection.find_all("tr")[1:]:
            wiki = Wiki()
            cols = wikiRow.find_all("td")
            wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
            wiki.summary = cols[2].text.replace("\n", "").strip(" ")
            wiki.updated = cols[3].text.replace("\n", "").strip(" ")
            wikiURL = projectURL + "wiki/" + wiki.pageName
            wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
            wikiContent = wikiPageSoup.find(id="wikicontent")
            wiki.htmlContent = wikiContent.prettify()
            wiki.textContent = wikiContent.get_text()
            project.wikis.append(wiki)
    return project
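# Hypothetical usage of getProject() above; the wrapper class name and the
# project name are illustrative assumptions, not values from the source:
# scraper = GoogleCodeScraper()
# project = scraper.getProject("example-project")
# for release in project.releases:
#     print(release.fileName, release.fileURL)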
class Controller(ExtensionController):
    def __init__(self, *args, **kwargs):
        ExtensionController.__init__(self, *args, **kwargs)
        wikidir = os.path.join(self.api.get_data_dir(), 'warehouse')
        self.wiki = Wiki(self.api.get_db(),
                         directory = wikidir,
                         wiki_word_handler = self.__wiki_word_handler,
                         wiki_url_handler = self.__wiki_url_handler)

    def __get_alias(self):
        page = self.api.get_requested_page()
        handle = page is not None and page.get_handle()
        return self.api.get_data().get_str('page') or handle

    def __get_page_name(self):
        page = self.api.get_requested_page()
        if self.api.get_data().get_str('page') is None and page is not None:
            return page.get_name()
        return self.__get_alias().split('/')[-1]

    def __get_user(self):
        # Find the name or IP of the current user.
        current_user = self.api.get_current_user()
        if current_user is not None:
            return current_user.get_handle()
        return os.environ["REMOTE_ADDR"]

    def __wiki_word_handler(self, url, word):
        alias = self.api.get_data().get_str('page')

        # The user is viewing the homepage of his web presence.
        if alias is None:
            url = self.api.get_requested_uri(page = word,
                                             revision = None,
                                             action = None)
            return (url, word)

        # The user is viewing a sub page of his web presence. Find out if it
        # is a sub-page of this wiki or the wiki homepage.
        page = self.api.get_requested_page()
        handle = page is not None and page.get_handle()
        pos = alias.find('/')
        wiki_home = pos == -1 and handle and handle == alias
        #print("WikiWord:", wiki_home, handle, alias, word)

        # If the requested page is a sub-page of a wiki (i.e. not the wiki
        # home), build the alias by cutting the requested path and appending
        # the new component.
        stack = alias.split('/')
        if wiki_home:
            stack.append(word)
        else:
            stack[-1] = word
        url = self.api.get_requested_uri(page = '/'.join(stack),
                                         revision = None,
                                         action = None)
        return (url, word)

    def __wiki_url_handler(self, url, word):
        if url.find(':') == -1:
            url = self.api.get_requested_uri(page = url,
                                             revision = None,
                                             action = None)
        return (url, word)

    def save(self, **kwargs):
        if 'cancel' in kwargs:
            return self.index()
        may_edit = self.api.current_user_may('edit_content')
        alias = self.__get_alias()
        page = WikiPage(self.wiki, alias)
        page.set_username(self.__get_user())
        page.set_content(kwargs['wiki_markup'])

        # Save.
        if not may_edit:
            errors = [_('No permission to save this page.')]
        elif kwargs.get('wiki_markup', '') == '':
            errors = [_('No text was entered...')]
        elif not self.wiki.save_page(page):
            errors = [_('Sorry, there was an internal error.'
                        ' The page could not be saved.')]
        else:
            errors = []

        # FIXME: We need more fine-grained control over what is and what
        # isn't outdated.
        self.api.flush_cache(self.api_key)

        # Show.
        page = self.wiki.get_page(alias)
        self.api.render('show.tmpl',
                        may_edit = may_edit,
                        html = Markup(page.get_html()),
                        errors = errors)

    def edit(self, **kwargs):
        may_edit = self.api.current_user_may('edit_content')
        name = self.__get_page_name()
        revision = kwargs.get('revision')
        page = self.wiki.get_page(self.__get_alias(), revision)
        errors = []
        if not may_edit:
            errors.append(_('You are not allowed to edit this page.'))
        elif page is None:
            errors.append(_('You are editing a new page.'))

        # Show the page. page is None for a new page, so fall back to empty
        # markup instead of calling get_content() on None.
        wiki_markup = page.get_content() if page is not None else ''
        self.api.render('edit.tmpl',
                        name = name,
                        may_edit = may_edit,
                        wiki_markup = wiki_markup,
                        errors = errors)

    def diff(self, **kwargs):
        may_edit = self.api.current_user_may('edit_content')
        name = self.__get_page_name()
        diff = self.wiki.get_diff(self.__get_alias(),
                                  kwargs.get('revision1'),
                                  kwargs.get('revision2'))
        self.api.render('diff.tmpl',
                        name = name,
                        may_edit = may_edit,
                        diff = diff,
                        errors = [])

    def history(self, **kwargs):
        may_edit = self.api.current_user_may('edit_content')
        name = self.__get_page_name()
        offset = kwargs.get('offset', 0)
        revisions = self.wiki.get_revision_list(self.__get_alias(), offset, 20)

        # Show the page.
        self.api.render('history.tmpl',
                        name = name,
                        revisions = revisions,
                        may_edit = may_edit,
                        errors = [])

    def index(self, **kwargs):
        may_edit = self.api.current_user_may('edit_content')
        revision = kwargs.get('revision')
        page = self.wiki.get_page(self.__get_alias(), revision)
        errors = []
        if page is None:
            return self.edit(**kwargs)
        elif revision:
            errors.append(_('Showing old revision %s' % revision))

        # Show the page.
        self.api.render('show.tmpl',
                        may_edit = may_edit,
                        html = Markup(page.get_html()),
                        errors = errors)
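# A worked example of __wiki_word_handler() above, with illustrative values:
# viewing sub-page alias 'docs/Intro', the WikiWord 'Setup' replaces the last
# path component and links to page 'docs/Setup'; viewing the wiki home (the
# alias equals the page handle and contains no '/'), the word is appended
# instead, so alias 'home' links to 'home/Setup'.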
from flask import Flask
from flask_cors import CORS

import threading

from Setup import Setup
from Wiki import Wiki
from Stackoverflow import Stackoverflow
from Intent import Intent
from MOOC import MOOC
from HelloWorld import HelloWorld
from Suggestions import Suggestions

app = Flask(__name__)
CORS(app)

setup = Setup()
info = Wiki()
tags = Stackoverflow()
intent = Intent()
mooc = MOOC()
helloworld = HelloWorld()
suggestions = Suggestions()

@app.route('/')
def index():
    return 'Hola! IIM'

'''
@app.route('/getSetup')
def getSetup():
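# A hypothetical route sketch showing how one of the helpers above might be
# exposed. info.summary() is an assumed method name for illustration only;
# the Wiki class's real API is not visible in this snippet.
@app.route('/getWiki/<topic>')
def getWiki(topic):
    # Flask serializes a returned dict to JSON automatically (Flask >= 1.1).
    return {'topic': topic, 'summary': info.summary(topic)}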
from Wiki import Wiki
from generator import hashlines

countries = Wiki('countries.json', 'wikipedia.json')
for item in countries:
    print(item)

for item in hashlines('wikipedia.json'):
    print(item)
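# The loop above only requires that Wiki be iterable. A minimal sketch of the
# contract it relies on -- assumed behavior; the real class may store and
# yield something entirely different:
import json

class IterableWiki:
    """Hypothetical stand-in illustrating Wiki's assumed __iter__ contract."""

    def __init__(self, countries_path, wikipedia_path):
        # Assumed: both constructor arguments are paths to JSON files.
        with open(countries_path) as f:
            self._items = json.load(f)
        self._source = wikipedia_path

    def __iter__(self):
        # `for item in wiki: ...` walks the loaded JSON entries.
        return iter(self._items)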