# Module-level dependencies not shown in this snippet: io, json, requests,
# WikiExtractor, plus page_ids, SentimentResult and analyze() from the
# surrounding module.
def go(name, date, cache=False):
    query_fmt = \
        'http://en.wikipedia.org/w/api.php?' \
        'action=query' \
        '&format=json' \
        '&prop=revisions' \
        '&pageids={}' \
        '&rvsection=0' \
        '&rvprop=timestamp%7Ccontent' \
        '&rvstart={:04d}-{:02d}-{:02d}T00%3A00%3A00.000Z'

    # format the query
    timestamp = "{}-{}-{}".format(date.year, date.month, date.day)
    query = query_fmt.format(page_ids[name], date.year, date.month, date.day)
    sentiment_result = SentimentResult(name, page_ids[name], timestamp)

    # check to see if we actually need to perform the lookup
    if cache and sentiment_result.is_cached:
        sentiment_result.sync()
        return sentiment_result

    # if we don't have it in the cache, perform the query
    data = json.loads(requests.get(query).text)

    # pull the wiki markup for the requested revision out of the JSON response
    wiki_markup = data['query']['pages'][str(page_ids[name])]['revisions'][0]['*']

    def format(text):
        lines = text.split('\n')
        return ' '.join([i for i in lines if i][1:-1])

    # extract readable text from the markup
    extractor = WikiExtractor.Extractor(page_ids[name], 0, name, wiki_markup)
    sio = io.StringIO()
    extractor.extract(sio)
    sio.seek(0)
    text = format(sio.read())

    # score the result with Google's sentiment analysis
    score, magnitude = analyze(text)
    sentiment_result.score = score
    sentiment_result.magnitude = magnitude
    sentiment_result.length = len(text)

    # cache to a file, if necessary
    if cache:
        sentiment_result.cache()

    return sentiment_result
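# The analyze() helper called above is not shown in this snippet. A minimal
# sketch of what it might look like, assuming the google-cloud-language client
# library is used for the sentiment call; the helper in the original source
# may differ:
from google.cloud import language_v1

def analyze(text):
    # Send plain text to the Cloud Natural Language API and return the
    # document-level sentiment score and magnitude.
    client = language_v1.LanguageServiceClient()
    document = language_v1.Document(
        content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    sentiment = client.analyze_sentiment(
        request={"document": document}).document_sentiment
    return sentiment.score, sentiment.magnitude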
# Module-level dependencies not shown in this snippet: io, WikiExtractor and
# the splitBySentences() helper.
def renderRevision(rev, title):
    """Renders a revision dictionary in HTML/WikiMarkup into plaintext.

    TODO: HTML conversion!"""
    if rev["*"] is not None:
        if rev["format"] == "wikimarkup":
            text = rev["*"]
            out = io.StringIO()
            extractor = WikiExtractor.Extractor(0, 0, title, text.split("\n"))
            extractor.extract(out)
            rev["*"] = out.getvalue()
            out.close()
            rev = splitBySentences(rev)
            rev["format"] = "plaintext"
            return rev
        else:
            return rev
    else:
        return rev
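# splitBySentences() is defined elsewhere in the original module. A naive,
# purely illustrative sketch of one possible implementation (the "sentences"
# key and the splitting rule are assumptions, not taken from the source):
import re

def splitBySentences(rev):
    # Split the plaintext on sentence-final punctuation followed by whitespace
    # and keep the resulting list alongside the raw text.
    sentences = re.split(r'(?<=[.!?])\s+', rev["*"].strip())
    rev["sentences"] = [s for s in sentences if s]
    return rev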
# Module-level dependencies not shown in this snippet: sys, pymongo,
# WikiExtractor (Python 2 code).
def cleanArticle(inCollection, outCollection):
    # Resume after the last article already written to the output collection.
    lastId = -1
    if outCollection.count() != 0:
        lastId = outCollection.find().sort([("_id", pymongo.DESCENDING)]).limit(1)[0]["_id"]
    print "Starting from id greater than: {}".format(lastId)
    sys.stdout.flush()

    numCleaned = 0
    for article in inCollection.find({"_id": {"$gt": lastId}}).sort([("_id", pymongo.ASCENDING)]):
        # Parse it.
        extractor = WikiExtractor.Extractor(article["_id"], article["title"], [article["text"]])
        article["text"] = extractor.clean()
        outCollection.insert_one(article)

        # Print progress.
        numCleaned += 1
        if numCleaned % 1000 == 0:
            print "Cleaned {} articles so far...".format(numCleaned)
            sys.stdout.flush()

    return numCleaned
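# Example invocation, assuming a MongoDB instance on localhost. The database
# and collection names ("wiki", "raw", "cleaned") are placeholders, not taken
# from the original project:
import pymongo

if __name__ == '__main__':
    client = pymongo.MongoClient("mongodb://localhost:27017")
    total = cleanArticle(client["wiki"]["raw"], client["wiki"]["cleaned"])
    print "Cleaned {} articles in total".format(total)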
# Module-level dependencies not shown in this snippet: re, WikiExtractor
# (this is a method of the wiki class defined in this file).
def words(self, normalise=False, strict_words=True, lowercase=False):
    # sentence-final tokens: ? ! . ?" !" ." ?'' !'' .''
    sentence_end_re = re.compile(
        u"(?:\.|\?|!|\.''|\?''|!''|\?\"|!\"|\.\")$", re.U)

    class outter(object):
        def __init__(self):
            self.ls = []

        def write(self, l):
            self.ls.append(l)

        def text(self):
            return u"".join(self.ls[1:-1])

    pages = 0
    for i, (id, title, page) in enumerate(self.pages()):
        pages += 1
        out = outter()
        WikiExtractor.Extractor(id, title, page).extract(out)
        lastw = None
        for w in out.text().split():
            wnorm = w
            # a new sentence starts at the beginning of a page or after a
            # sentence-final token
            if lastw is None or sentence_end_re.search(lastw):
                sentence_start = True
            else:
                sentence_start = False
            # special case: section headings such as ==Zdroje== ("Sources")
            if not sentence_start:
                if w.startswith("==") or lastw.endswith("=="):
                    sentence_start = True
            if normalise:
                wnorm = self.normalise(w, True, False)
            if strict_words:
                # drop all-uppercase and numeric tokens, and tokens that still
                # contain non-letter characters after inner normalisation
                if wnorm.isupper() or wnorm.isnumeric():
                    wnorm = ""
                else:
                    wnorm1 = self.normalise(wnorm, False, True)
                    if len(wnorm1) != len(wnorm):
                        wnorm = ""
            if lowercase and 0 < len(wnorm):
                wnorm = wnorm.lower()
            # TODO debug
            # if wnorm in (
            #     u"Má",
            # ):
            #     if sentence_start:
            #         pass
            #     else:
            #         pass
            if 0 == len(wnorm):
                lastw = w
                continue
            # no-op hooks comparing capitalisation with the detected sentence
            # boundary (left in place for debugging)
            if not sentence_start and w[0].isupper():
                pass
            if sentence_start and not w[0].isupper():
                pass
            yield w, wnorm, sentence_start, pages
            lastw = w
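# Example use of the generator above: build a simple frequency table of
# normalised words. It assumes a wiki instance like the one constructed in the
# __main__ demo at the end of this file; the limit parameter is only for
# illustration:
from collections import Counter

def word_frequencies(w, limit=1000):
    # Count normalised, lowercased forms as yielded by words().
    counts = Counter()
    for i, (word, wnorm, sentence_start, pages) in enumerate(
            w.words(normalise=True, lowercase=True)):
        counts[wnorm] += 1
        if i >= limit:
            break
    return counts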
        # Tail of a text-normalisation helper: the _normalise_re_* patterns
        # are regular expressions defined as attributes of the wiki class.
        text = wiki._normalise_re_apos1.sub(ur'\1"', text)
        text = wiki._normalise_re_apos2.sub(ur'"\1', text)
        text = wiki._normalise_re_apos3.sub(ur'"', text)
        text = wiki._normalise_re_non_letter_start.sub(ur'', text)
        text = wiki._normalise_re_non_letter_end.sub(ur'', text)
        if inner:
            text = wiki._normalise_re_non_letter.sub(ur'', text)
        return text


if __name__ == '__main__':
    w = wiki("../skwiki-20151226-pages-articles.xml")

    class outter(object):
        def __init__(self):
            self.ls = []

        def write(self, l):
            self.ls.append(l)

        def text(self):
            return "".join(self.ls[1:-1])

    for i, (id, title, page) in enumerate(w.pages()):
        out = outter()
        WikiExtractor.Extractor(id, title, page).extract(out)
        print out.text()
        if i > 5000:
            break
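# The _normalise_re_* patterns used above are attributes of the wiki class and
# are not shown in this snippet. One plausible reconstruction, inferred only
# from how the substitutions are applied (converting wiki '' quotes to
# straight quotes and trimming non-letter characters); the real patterns in
# the source may differ:
import re

_normalise_re_apos1 = re.compile(ur"(\w)''", re.U)                # word'' -> word"
_normalise_re_apos2 = re.compile(ur"''(\w)", re.U)                # ''word -> "word
_normalise_re_apos3 = re.compile(ur"''", re.U)                    # remaining '' -> "
_normalise_re_non_letter_start = re.compile(ur"^[\W_]+", re.U)    # leading non-letters
_normalise_re_non_letter_end = re.compile(ur"[\W_]+$", re.U)      # trailing non-letters
_normalise_re_non_letter = re.compile(ur"[\W_]", re.U)            # any non-letter (inner pass)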