Example 1
def go(name, date, cache=False):

    query_fmt = \
        'http://en.wikipedia.org/w/api.php?' \
        'action=query'    \
        '&format=json'    \
        '&prop=revisions' \
        '&list='          \
        '&pageids={}'     \
        '&rvsection=0'    \
        '&rvprop=timestamp%7Ccontent' \
        '&rvstart={:04d}-{:02d}-{:02d}T00%3A00%3A00.000Z'

    # format the query
    timestamp = "{}-{}-{}".format(date.year, date.month, date.day)
    query = query_fmt.format(page_ids[name], date.year, date.month, date.day)
    sentiment_result = SentimentResult(name, page_ids[name], timestamp)

    # check to see if we actually need to perform the lookup
    if cache and sentiment_result.is_cached:
        sentiment_result.sync()
        return sentiment_result
    
    # if we don't have it in the cache, perform the query
    data  = json.loads(requests.get(query).text)

    # parse the result with BeautifulSoup
    wiki_markup  = data['query']['pages'][str(page_ids[name])]['revisions'][0]['*']
    
    def format(text):
        # drop blank lines, then discard the first and last remaining lines and join the rest
        lines = text.split('\n')
        return ' '.join([i for i in lines if i][1:-1])

    # extract readable text from the markup
    extractor = WikiExtractor.Extractor(page_ids[name], 0, name, wiki_markup)
    sio = io.StringIO()
    extractor.extract(sio)
    sio.seek(0)
    text = format(sio.read())
    
    # score the result with Google's sentiment analysis
    score, magnitude = analyze(text)
    sentiment_result.score = score
    sentiment_result.magnitude = magnitude
    sentiment_result.length = len(text)
    
    # cache to a file, if necessary
    if cache: sentiment_result.cache()

    return sentiment_result
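A minimal usage sketch for the function above, assuming the surrounding module defines page_ids (a name-to-pageid mapping), SentimentResult, and analyze, none of which are shown here; the page title and date are made up for illustration:

import datetime

# hypothetical call: score the lead section of a page as of 1 January 2020,
# reusing a cached result when one exists
result = go("Python (programming language)", datetime.date(2020, 1, 1), cache=True)
print(result.score, result.magnitude, result.length)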
Example 2
def renderRevision(rev, title):
    """Renders revision dictionary in HTML/WikiMarkup into plaintext. TODO Html conversion!"""

    if (rev["*"] != None):
        if (rev["format"] == "wikimarkup"):
            text = rev["*"]
            out = io.StringIO()
            extractor = WikiExtractor.Extractor(0, 0, title, text.split("\n"))
            extractor.extract(out)
            rev["*"] = out.getvalue()
            out.close()
            rev = splitBySentences(rev)
            rev["format"] = "plaintext"
            return rev
        else:
            return rev
    else:
        return rev
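A sketch of calling renderRevision, assuming a revision dict shaped the way the function expects (a "*" key holding the markup and a "format" key) and that splitBySentences is available from the surrounding module:

rev = {
    "*": "'''Python''' is a programming language.\n== History ==\n...",
    "format": "wikimarkup",
}
rendered = renderRevision(rev, "Python (programming language)")
print(rendered["format"])  # "plaintext" once the markup has been stripped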
Example 3
def cleanArticle(inCollection, outCollection):
    lastId = -1
    if outCollection.count() != 0:
        lastId = outCollection.find().sort([("_id", pymongo.DESCENDING)]).limit(1)[0]["_id"]
        print "Starting from id greater than: {}".format(lastId)
        sys.stdout.flush()
    numCleaned = 0
    for article in inCollection.find({"_id": {"$gt": lastId}}).sort([("_id", pymongo.ASCENDING)]):
        # Strip the wiki markup from the article text.
        extractor = WikiExtractor.Extractor(article["_id"], article["title"], [article["text"]])
        article["text"] = extractor.clean()
        outCollection.insert_one(article)

        # Print progress.
        numCleaned += 1
        if numCleaned % 1000 == 0:
            print "Cleaned {} articles so far...".format(numCleaned)
            sys.stdout.flush()

    return numCleaned
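The function above takes two pymongo collections; a sketch of wiring it up, with database and collection names invented for illustration:

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["wikipedia"]              # hypothetical database name
raw = db["raw_articles"]              # input: articles with wiki markup in "text"
clean = db["clean_articles"]          # output: plaintext articles
total = cleanArticle(raw, clean)
print("Cleaned {} articles in total".format(total))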
Example 4
    def words(self, normalise=False, strict_words=True, lowercase=False):

        # sentence terminators: . ? ! optionally followed by a closing " or ''
        sentence_end_re = re.compile(
            u"(?:\.|\?|!|\.''|\?''|!''|\?\"|!\"|\.\")$", re.U)

        # minimal file-like sink collecting the extractor's output lines
        class outter(object):
            def __init__(self):
                self.ls = []

            def write(self, l):
                self.ls.append(l)

            def text(self):
                return u"".join(self.ls[1:-1])

        pages = 0
        for i, (id, title, page) in enumerate(self.pages()):
            pages += 1
            out = outter()
            WikiExtractor.Extractor(id, title, page).extract(out)
            lastw = None
            for w in out.text().split():
                wnorm = w

                # special case: section headings such as ==Zdroje ("Sources")
                if lastw is None or sentence_end_re.search(lastw):
                    sentence_start = True
                else:
                    sentence_start = False
                if not sentence_start:
                    if w.startswith("==") or lastw.endswith("=="):
                        sentence_start = True

                if normalise:
                    wnorm = self.normalise(w, True, False)

                if strict_words:
                    if wnorm.isupper() or wnorm.isnumeric():
                        wnorm = ""
                    else:
                        wnorm1 = self.normalise(wnorm, False, True)
                        if len(wnorm1) != len(wnorm):
                            wnorm = ""
                    if lowercase and 0 < len(wnorm):
                        wnorm = wnorm.lower()
                # TODO debug
                # if wnorm in(
                #         u"Má",
                # ):
                #     if sentence_start:
                #         pass
                #     else:
                #         pass
                if 0 == len(wnorm):
                    lastw = w
                    continue
                if not sentence_start and w[0].isupper():
                    pass
                if sentence_start and not w[0].isupper():
                    pass
                yield w, wnorm, sentence_start, pages
                lastw = w
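A sketch of consuming the generator above, assuming it lives on the same wiki class shown in Example 5 and that a local dump with this path exists:

w = wiki("../skwiki-20151226-pages-articles.xml")
for word, norm, sentence_start, pages_seen in w.words(normalise=True, lowercase=True):
    print(norm)
    if pages_seen > 10:  # stop after a handful of pages for a quick look
        break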
Example 5
            text = wiki._normalise_re_apos1.sub(ur'\1"', text)
            text = wiki._normalise_re_apos2.sub(ur'"\1', text)
            text = wiki._normalise_re_apos3.sub(ur'"', text)
            text = wiki._normalise_re_non_letter_start.sub(ur'', text)
            text = wiki._normalise_re_non_letter_end.sub(ur'', text)
        if inner:
            text = wiki._normalise_re_non_letter.sub(ur'', text)

        return text


if __name__ == '__main__':
    w = wiki("../skwiki-20151226-pages-articles.xml")

    class outter(object):
        def __init__(self):
            self.ls = []

        def write(self, l):
            self.ls.append(l)

        def text(self):
            return "".join(self.ls[1:-1])

    for i, (id, title, page) in enumerate(w.pages()):
        out = outter()
        WikiExtractor.Extractor(id, title, page).extract(out)
        print out.text()
        if i > 5000:
            break