Example #1
def gradeWiki(partId, ch_aux):
    import sys
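    # Redirect stdout to a sink so output printed during grading is suppressed.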
    original_stdout = sys.stdout
    sys.stdout = NullDevice()
    wiki = Wiki()
    if partId == 1:
        wivesFile = '../data/wives.txt'
        goldFile = '../data/gold.txt'
        wives = wiki.addWives(wivesFile)
    else:
        wives = ch_aux.split('\n')
        wives = wives[1:]
    wikiFile = '../data/small-wiki.xml'
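    # Pass 1 parses the original dump; pass 2 re-runs on the copy
    # ('wiki_mod') produced by modifyWiki.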
    infoBoxHusbands = wiki.processFile(open(wikiFile), wives, True)
    modifyWiki(ch_aux[0], wikiFile)
    mod_file = open('wiki_mod')
    noInfoHusbands = wiki.processFile(mod_file, wives, False)
    #os.remove('wiki_mod')
    sys.stdout = original_stdout
    if partId == 1:
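        # Compare both extraction passes against the gold answers.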
        infoScore = evaluateAnswers(infoBoxHusbands, goldFile)
        noInfoScore = evaluateAnswers(noInfoHusbands, goldFile)
        return (infoScore + noInfoScore)
    else:
        return encodeWiki(infoBoxHusbands, noInfoHusbands)
Example #2
 def __init__(self, *args, **kwargs):
     ExtensionController.__init__(self, *args, **kwargs)
     wikidir   = os.path.join(self.api.get_data_dir(), 'warehouse')
     self.wiki = Wiki(self.api.get_db(),
                      directory         = wikidir,
                      wiki_word_handler = self.__wiki_word_handler,
                      wiki_url_handler  = self.__wiki_url_handler)
Example #3
 def __processEpisodes(self):
     import re
     from Episode import Episode
     from Wiki import Wiki, WikiLookupError
     
     seasonData = self.__seasonData
     
     deeplink = self.__getSeasonDeeplink()
     if deeplink is not None:
         wiki = Wiki()
         seasonData = wiki.query(deeplink)
         self.__seasonData = seasonData

     # Each '{{Episode list' template marks the start of one episode block;
     # a block ends where the next one begins, or at the end of the data.
     episodeRegex = re.compile('{{Episode list', re.IGNORECASE)
     episodeMatches = list(episodeRegex.finditer(seasonData))
     if not episodeMatches:
         raise WikiLookupError("Could not find '{{Episode list' tag")

     for i, match in enumerate(episodeMatches):
         episodeStart = match.start()
         if i + 1 < len(episodeMatches):
             episodeEnd = episodeMatches[i + 1].start()
         else:
             episodeEnd = len(seasonData) - 1
         self.__processEpisodeData(episodeStart, episodeEnd)
Example #4
    def start(self):
        subClass = None
        while True:
            for col in self.colList:
                self.articles.clear()

                if col == "api_busan":
                    subClass = Busan()
                    print("\n\n부산\n\n")
                elif col == "api_herald":
                    subClass = Herald()
                    print("\n\n헤럴드\n\n")
                elif col == "api_nocut":
                    subClass = Nocut()
                    print("\n\n노컷\n\n")
                elif col == "api_ohmynews":
                    subClass = Ohmynews()
                    print("\n\n오마이\n\n")
                elif col == "api_wikitree":
                    subClass = Wiki()
                    print("\n\n위키\n\n")
                elif col == "api_donga":
                    subClass = Donga()
                    print("\n\n동아\n\n")
                elif col == "api_hangook":
                    subClass = Hangook()
                    print("\n\n한국\n\n")
                elif col == "api_joseon":
                    subClass = Joseon()
                    print("\n\n조선\n\n")
                elif col == "api_yeonhap":
                    subClass = Yeonhap()
                    print("\n\n연합\n\n")
                elif col == "api_joongang":
                    subClass = Joongang()
                    print("\n\n중앙\n\n")

                self.articles = subClass.crawling()
                subClass.quit()

                self.col = self.db[col]
                self.insertDB()
Example #5
    def getProjects(self):
        # Perhaps I should provide more API endpoints to make scraping easier...
        projectlist = self.curl_get(self.DOMAIN + "/projects/").getvalue()
        soup = BeautifulSoup(projectlist, "html.parser")
        links = soup.find("ul", "prjlistclass")
        projects = []
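        # Visit each linked project and scrape its source, downloads, issues, and wiki pages.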
        for link in links.find_all("a"):
            project = Project()
            sourceType = None
            projectURL = self.DOMAIN + link.get("href")
            projectName = projectURL.split("/")[-2]

            projectpageHTML = self.curl_get(projectURL).getvalue()
            projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")

            sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
            sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + sourceURL).getvalue(), "html.parser")
            sourceSoupText = sourceSoup.get_text()

            # get source
            if "git clone" in sourceSoupText:
                project.repoType = REPO_TYPES.git
                project.repoURL = "git://beta.datanethost.net/" + projectName + ".git"
            elif "svn co" in sourceSoupText:
                project.repoType = REPO_TYPES.SVN
                project.repoURL = self.DOMAIN + "/svn/" + projectName + "/"
            else:
                project.repoType = REPO_TYPES.hg
                project.repoURL = self.DOMAIN + "/hg/" + projectName + "/"


            # get downloads
            project.releases = []
            downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/").getvalue(), "html.parser")
            downloadSection = downloadsSoup.find("table", "uploads")
            if "No downloads were found." not in downloadsSoup.get_text():
                downloadRows = downloadSection.find_all("tr")[1:]
                for downloadRow in downloadRows:
                    cols = downloadRow.find_all("td")
                    downloadTD = cols[0]
                    downloadURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + downloadTD.a.text
                    fileName = downloadTD.a.text
                    release = Release()
                    release.fileURL = downloadURL
                    release.fileName = fileName
                    project.releases.append(release)

            # get issues
            project.issues = []
            issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/").getvalue(), "html.parser")
            if "No issues were found." not in issuesSoup.get_text():
                issuesSection = issuesSoup.find("table", "recent-issues")
                for issueRow in issuesSection.find_all("tr")[1:]:
                    issue = Issue()
                    cols = issueRow.find_all("td")
                    issueId = cols[0].text
                    issueURL = projectURL + "issues/" + issueId + "/"
                    issueStatus = cols[2].text
                    issueSummary = cols[1].text
                    issueTitle = cols[1].find("a").text
                    issueAuthor = cols[3].text
                    issue.author = issueAuthor
                    issue.comments = []
                    issue.status = issueStatus
                    issue.summary = issueSummary
                    issue.title = issueTitle
                    issue.id = issueId
                    # we must go deeper to get comments
                    issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
                    for comment in issueComments.find_all("div", "issue-comment"):
                        author = comment.find("p").get_text().split("by")[1].split(",")[0]
                        date = comment.find("span").get_text()
                        commentText = comment.find("pre").get_text()
                        issueComment = IssueComment()
                        issueComment.date = date
                        issueComment.author = author
                        issueComment.summary = commentText
                        issue.comments.append(issueComment)

                    project.issues.append(issue)

            # get wiki pages
            project.wikis = []
            wikiSoup = BeautifulSoup(self.curl_get(projectURL + "doc/").getvalue(), "html.parser")
            if "No documentation pages were found." not in wikiSoup.get_text():
                wikiSection = wikiSoup.find("table", "recent-issues")
                for wikiRow in wikiSection.find_all("tr")[1:]:
                    wiki = Wiki()
                    cols = wikiRow.find_all("td")
                    wiki.pageName = cols[0].text
                    wiki.summary = cols[1].text
                    wiki.updated = cols[2].text
                    wikiURL = projectURL + "page/" + wiki.pageName + "/"
                    wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
                    wikiContent = wikiPageSoup.find(id="wiki-content")
                    wiki.htmlContent = wikiContent.prettify()
                    wiki.textContent = wikiContent.get_text()
                    project.wikis.append(wiki)


            projects.append(project)

        return projects
Example #6
    def getProject(self, projectName):
        project = Project()
        sourceType = None
        projectURL = self.DOMAIN + "/p/" + projectName + "/"
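        # All sections below (source, downloads, issues, wiki) are scraped
        # relative to this project page.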

        projectpageHTML = self.curl_get(projectURL).getvalue()
        projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")

        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
        sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + "/p/" + sourceURL).getvalue(), "html.parser")
        sourceSoupText = sourceSoup.get_text()

        # get source
        if "git clone" in sourceSoupText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "https://code.google.com/p/" + projectName + "/"
        elif "svn co" in sourceSoupText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
        else:
            project.repoType = REPO_TYPES.hg
            project.repoURL = "https://code.google.com/p/" + projectName + "/"


        # get downloads
        project.releases = []
        downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/list").getvalue(), "html.parser")
        downloadSection = downloadsSoup.find("table", "results")
        if "Your search did not generate any results." not in downloadsSoup.get_text():
            downloadRows = downloadSection.find_all("tr")[1:]
            for downloadRow in downloadRows:
                cols = downloadRow.find_all("td")
                downloadTD = cols[1]
                downloadURL = "https://" + projectName + ".googlecode.com/files/" + downloadTD.a.text.replace("\n", "").strip(" ")
                fileName = downloadTD.a.text.replace("\n", "").strip(" ")
                release = Release()
                release.fileURL = downloadURL
                release.fileName = fileName
                project.releases.append(release)

        # get issues
        project.issues = []
        issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/list").getvalue(), "html.parser")
        if "Your search did not generate any results." not in issuesSoup.get_text():
            issuesSection = issuesSoup.find("table", "results")
            for issueRow in issuesSection.find_all("tr")[1:]:
                issue = Issue()
                cols = issueRow.find_all("td")
                issueId = cols[1].text.replace("\n", "").strip()
                issueURL = projectURL + "issues/detail?id=" + issueId
                issueStatus = cols[3].text.replace("\n", "").strip(" ")
                issueSummary = cols[8].text.replace("\n", "")
                issueTitle = cols[8].text.replace("\n", "")
                issueAuthor = cols[5].text.replace("\n", "")

                #issue.author = issueAuthor
                issue.comments = []
                issue.status = issueStatus.strip(" ")
                issue.summary = issueSummary.strip(" ")
                issue.title = issueTitle
                issue.id = issueId

                # we must go deeper to get comments
                issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
                for comment in issueComments.find_all("div", "vt"):
                    #author = comment.find(class_="author").find("a").text
                    author = (comment.find(class_="author").find_all("a")[-1]).contents
                    date = comment.find("span", "date")["title"]
                    commentText = comment.find("pre").get_text()
                    issueComment = IssueComment()
                    issueComment.date = date
                    issueComment.author = author
                    issueComment.summary = commentText
                    issue.comments.append(issueComment)

                project.issues.append(issue)

        # get wiki pages
        project.wikis = []
        wikiSoup = BeautifulSoup(self.curl_get(projectURL + "w/list").getvalue(), "html.parser")
        if "Your search did not generate any results." not in wikiSoup.get_text():
            wikiSection = wikiSoup.find("table", "results")
            for wikiRow in wikiSection.find_all("tr")[1:]:
                wiki = Wiki()
                cols = wikiRow.find_all("td")
                wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
                wiki.summary = cols[2].text.replace("\n", "").strip(" ")
                wiki.updated = cols[3].text.replace("\n", "").strip(" ")
                wikiURL = projectURL + "wiki/" + wiki.pageName
                wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
                wikiContent = wikiPageSoup.find(id="wikicontent")
                wiki.htmlContent = wikiContent.prettify()
                wiki.textContent = wikiContent.get_text()
                project.wikis.append(wiki)

        return project
Example #7
class Controller(ExtensionController):
    def __init__(self, *args, **kwargs):
        ExtensionController.__init__(self, *args, **kwargs)
        wikidir   = os.path.join(self.api.get_data_dir(), 'warehouse')
        self.wiki = Wiki(self.api.get_db(),
                         directory         = wikidir,
                         wiki_word_handler = self.__wiki_word_handler,
                         wiki_url_handler  = self.__wiki_url_handler)


    def __get_alias(self):
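        # Prefer the page named in the request data; fall back to the
        # handle of the requested page.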
        page   = self.api.get_requested_page()
        handle = page is not None and page.get_handle()
        return self.api.get_data().get_str('page') or handle


    def __get_page_name(self):
        page = self.api.get_requested_page()
        if self.api.get_data().get_str('page') is None and page is not None:
            return page.get_name()
        return self.__get_alias().split('/')[-1]


    def __get_user(self):
        # Find the name or IP of the current user.
        current_user = self.api.get_current_user()
        if current_user is not None:
            return current_user.get_handle()
        return os.environ["REMOTE_ADDR"]


    def __wiki_word_handler(self, url, word):
        alias = self.api.get_data().get_str('page')

        # The user is viewing the homepage of his web presence.
        if alias is None:
            url = self.api.get_requested_uri(page     = word,
                                             revision = None,
                                             action   = None)
            return (url, word)

        # The user is viewing a sub page of his web presence. Find out if it
        # is a sub-page of this wiki or the wiki homepage.
        page      = self.api.get_requested_page()
        handle    = page is not None and page.get_handle()
        pos       = alias.find('/')
        wiki_home = pos == -1 and handle and handle == alias
        #print "WikiWord:", wiki_home, handle, alias, word

        # If the requested page is a sub-page of a wiki (i.e. not the wiki
        # home), build the alias by cutting the requested path and appending
        # the new component.
        stack = alias.split('/')
        if wiki_home:
            stack.append(word)
        else:
            stack[-1] = word
        url = self.api.get_requested_uri(page     = '/'.join(stack),
                                         revision = None,
                                         action   = None)
        return (url, word)


    def __wiki_url_handler(self, url, word):
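        # URLs without a scheme (no ':') are treated as internal page names.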
        if url.find(':') == -1:
            url = self.api.get_requested_uri(page     = url,
                                             revision = None,
                                             action   = None)
        return (url, word)


    def save(self, **kwargs):
        if 'cancel' in kwargs:
            return self.index()

        may_edit = self.api.current_user_may('edit_content')
        alias    = self.__get_alias()
        page     = WikiPage(self.wiki, alias)
        page.set_username(self.__get_user())
        page.set_content(kwargs['wiki_markup'])
        
        # Save.
        if not may_edit:
            errors = [_('No permission to save this page.')]
        elif kwargs.get('wiki_markup', '') == '':
            errors = [_('No text was entered...')]
        elif not self.wiki.save_page(page):
            errors = [_('Sorry, there was an internal error.'
                        ' The page could not be saved.')]
        else:
            errors = []

        # FIXME: We need more fine-grained control over what is and what isn't outdated.
        self.api.flush_cache(self.api_key)

        # Show.
        page = self.wiki.get_page(alias)
        self.api.render('show.tmpl',
                        may_edit = may_edit,
                        html     = Markup(page.get_html()),
                        errors   = errors)


    def edit(self, **kwargs):
        may_edit = self.api.current_user_may('edit_content')
        name     = self.__get_page_name()
        revision = kwargs.get('revision')
        page     = self.wiki.get_page(self.__get_alias(), revision)

        errors = []
        if not may_edit:
            errors.append(_('You are not allowed to edit this page.'))
        elif page is None:
            errors.append(_('You are editing a new page.'))

        # Show the page. A brand-new page has no content to prefill.
        self.api.render('edit.tmpl',
                        name        = name,
                        may_edit    = may_edit,
                        wiki_markup = page.get_content() if page is not None else '',
                        errors      = errors)


    def diff(self, **kwargs):
        may_edit = self.api.current_user_may('edit_content')
        name     = self.__get_page_name()
        diff     = self.wiki.get_diff(self.__get_alias(),
                                      kwargs.get('revision1'),
                                      kwargs.get('revision2'))
        self.api.render('diff.tmpl',
                        name     = name,
                        may_edit = may_edit,
                        diff     = diff,
                        errors   = [])


    def history(self, **kwargs):
        may_edit = self.api.current_user_may('edit_content')
        name     = self.__get_page_name()
        offset    = kwargs.get('offset', 0)
        revisions = self.wiki.get_revision_list(self.__get_alias(), offset, 20)

        # Show the page.
        self.api.render('history.tmpl',
                        name      = name,
                        revisions = revisions,
                        may_edit  = may_edit,
                        errors    = [])


    def index(self, **kwargs):
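        # Show the requested page, or open the editor if it does not exist yet.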
        may_edit = self.api.current_user_may('edit_content')
        revision = kwargs.get('revision')
        page     = self.wiki.get_page(self.__get_alias(), revision)

        errors = []
        if page is None:
            return self.edit(**kwargs)
        elif revision:
            errors.append(_('Showing old revision %s') % revision)

        # Show the page.
        self.api.render('show.tmpl',
                        may_edit = may_edit,
                        html     = Markup(page.get_html()),
                        errors   = errors)
Example #8
import threading

from flask import Flask
from flask_cors import CORS

from Setup import Setup
from Wiki import Wiki
from Stackoverflow import Stackoverflow
from Intent import Intent
from MOOC import MOOC
from HelloWorld import HelloWorld
from Suggestions import Suggestions

app = Flask(__name__)
CORS(app)

# Service objects instantiated once at startup.
setup = Setup()
info = Wiki()
tags = Stackoverflow()
intent = Intent()
mooc = MOOC()
helloworld = HelloWorld()
suggestions = Suggestions()


@app.route('/')
def index():
    return 'Hola! IIM'


'''
@app.route('/getSetup')
def getSetup():
Example #9
from Wiki import Wiki
from generator import hashlines

# A Wiki instance is iterable; print each item it yields.
countries = Wiki('countries.json', 'wikipedia.json')
for item in countries:
    print(item)

for item in hashlines('wikipedia.json'):
    print(item)