def harvest(thesaurus):
    p = page.Page(WIKTIONNAIRE, thesaurusPrefix + thesaurus + fr)
    if p.exists() and thesaurus not in cache["thesaurus"]:
        print(thesaurus.upper())
        cache["thesaurus"][thesaurus] = []
        text = p.text
        wikiWords = [x[2:len(x) - 2] for x in link.findall(text)]
        for wikiWord in wikiWords:
            if sources(wikiWord):
                cache["thesaurus"][thesaurus].append(wikiWord)
        write()
def harvest(thesaurus): """Harvest information about a given Thesaurus and save cache""" LOG.info("Harvesting: %s", thesaurus) p = page.Page(WIKTIONNAIRE, thesaurusPrefix + thesaurus + fr) if p.exists and thesaurus not in CACHE["thesaurus"]: LOG.debug(thesaurus.upper()) CACHE["thesaurus"][thesaurus] = [] text = page.Page(WIKTIONNAIRE, thesaurusPrefix + thesaurus + fr).text wikiWords = [x[2:len(x) - 2] for x in link.findall(text)] for wikiWord in wikiWords: if sources(wikiWord): CACHE["thesaurus"][thesaurus].append(wikiWord) cache.save(CACHE)
def update():
    LOG.info("updating Wikimedia projects")
    data = request.args.get('data', 0, type=str)
    d = ast.literal_eval(data)
    for cluster in d:
        if cluster["id"] != "unclustered" and len(cluster["images"]) > 0:
            cluster["images"] = [idMap[img] for img in cluster["images"]]
            fusion_cat([page.Page(COMMONS, img) for img in cluster["images"]])
    return render_template('result.html', **result)
def gender(author):
    result = []
    wikiArticle = page.Page(WIKIPEDIA, author)
    if wikiArticle.exists():
        item = wikiArticle.data_item()
        # "P21" is the Wikidata property for sex or gender; the bare name
        # `gender` would shadow this function itself.
        if "P21" in item.claims:
            for claim in item.claims["P21"]:
                if claim.getTarget() is not None:
                    result.append(claim.getTarget().id)
    return {author: {genders: result}}
def categories(p, height=0):
    if p not in categories_tree or "Parents" not in categories_tree[p]:
        categories_tree[p] = {
            "Parents": [c.title() for c in page.Page(COMMONS, p).categories()]
        }
    cats = set(categories_tree[p]["Parents"])
    if height == 0:
        return cats
    else:
        temp = set(cats)
        for cat in cats:
            temp |= categories(cat, height - 1)
        return temp
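
# Usage sketch: with height > 0 the function climbs the Commons category
# graph recursively, memoising each category's parents in `categories_tree`.
# E.g. (category name here is illustrative only):
#
#   ancestors = categories("Category:Paintings by Claude Monet", height=2)
#
# returns the category's parents plus its grandparents as one flat set.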
def visualize(category_name, clusters):
    test_page = page.Page(COMMONS, "User:Donna Nobot/clusterArtworks")
    stringBuffer = [test_page.text]
    stringBuffer.append("\n\n== [[:Category:")
    stringBuffer.append(category_name)
    stringBuffer.append("|")
    stringBuffer.append(category_name)
    stringBuffer.append("]] ==\n")
    for cluster in clusters:
        stringBuffer.append("<gallery mode=\"packed\">\n")
        for file in cluster:
            stringBuffer.append(file)
            stringBuffer.append("\n")
        stringBuffer.append("</gallery>\n\n")
    test_page.put("".join(stringBuffer), "#clusterArtworks")
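
# For reference, each cluster is appended to the user page as a packed
# gallery under a category heading, producing wikitext of this shape:
#
#   == [[:Category:Paintings by Claude Monet|Paintings by Claude Monet]] ==
#   <gallery mode="packed">
#   File:Example 1.jpg
#   File:Example 2.jpg
#   </gallery>
#
# (the category and file names above are illustrative only)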
def characteristics(author):
    print("\t" + author)
    result = {}
    wikiArticle = page.Page(WIKIPEDIA, author)
    if wikiArticle.exists():
        while wikiArticle.isRedirectPage():
            wikiArticle = wikiArticle.getRedirectTarget()
        item = wikiArticle.data_item()
        for data in datas:
            if data in item.claims:
                for claim in item.claims[data]:
                    if claim.getTarget() is not None:
                        if data == birthDate:
                            result[data] = claim.getTarget().year
                        else:
                            result[data] = claim.getTarget().id
    return result
def sources(word):
    result = {'authors': []}
    text = page.Page(WIKTIONNAIRE, word).text
    templates = source.findall(text)
    for template in templates:
        # Authors are linked to wikipedia
        wikiAuthors = wAuthor.findall(template)
        for wikiAuthor in wikiAuthors:
            if wikiAuthor not in result["authors"]:
                result["authors"].append(wikiAuthor)
                genders = gender(wikiAuthor)
                if wikiAuthor not in cache["authors"]:
                    cache["authors"][wikiAuthor] = genders
                else:
                    cache["authors"][wikiAuthor].update(genders)
    return {word: result}
def characteristics(author): """Find characteristics for an author on Wikidata.""" LOG.debug("\t%s", author) result = {} wikiArticle = page.Page(WIKIPEDIA, author) if wikiArticle.exists(): while wikiArticle.isRedirectPage(): wikiArticle = wikiArticle.getRedirectTarget() item = wikiArticle.data_item() for data in datas: if data in item.claims: for claim in item.claims[data]: if claim.getTarget() is not None: if data is birthDate: result[data] = claim.getTarget().year else: result[data] = claim.getTarget().id return result
def sources(word):
    print(word)
    p = page.Page(WIKTIONNAIRE, word)
    if p.exists():
        cache[words][word] = []
        text = p.text
        templates = source.findall(text)
        for template in templates:
            # Authors are linked to wikipedia
            wikiAuthors = [x[4:len(x) - 2] for x in wAuthor.findall(template)]
            for wikiAuthor in wikiAuthors:
                if wikiAuthor not in cache[authors]:
                    cache[authors][wikiAuthor] = characteristics(wikiAuthor)
                    cache[authors][wikiAuthor][words] = []
                cache[words][word].append(wikiAuthor)
                cache[authors][wikiAuthor][words].append(word)
        return True
    else:
        return False
def sources(word): """Find sources information for word.""" LOG.debug(word) p = page.Page(WIKTIONNAIRE, word) if p.exists: CACHE[words][word] = [] text = p.text templates = source.findall(text) for template in templates: # Authors are linked to wikipedia wikiAuthors = [x[4:len(x) - 2] for x in wAuthor.findall(template)] wikiAuthors += [ x[0] + " " + x[1] for x in nomWAuthor.findall(template) ] for wikiAuthor in wikiAuthors: if wikiAuthor not in CACHE[authors]: CACHE[authors][wikiAuthor] = characteristics(wikiAuthor) CACHE[authors][wikiAuthor][words] = [] CACHE[words][word].append(wikiAuthor) CACHE[authors][wikiAuthor][words].append(word) return True else: return False
def scan_list(pages, new_description, reorder_sections, must_be_part):
    apply_all = False
    for article in pages:
        is_english = not NOT_ENGLISH.match(article.title())
        quote_parameter = (ENGLISH_QUOTE_PARAMETER
                           if is_english else NOT_ENGLISH_QUOTE_PARAMETER)
        print("======================================")
        print("Working on: {}".format(article.title()))
        parsed = mwparserfromhell.parse(article.text)
        comment = {
            "desc title": 0,
            "desc to quote": 0,
        }
        comments = []
        is_part = False
        for template in parsed.filter_templates():
            template_name = template.name.lower().strip()
            if template_name == "description":
                template.name = "Quote"
                _minc(comment, "desc to quote")
            elif template_name.find("infobox/part") >= 0 or template_name.find(
                    "partbox") >= 0:
                is_part = True
                print("NOTE: Page '{}' does not use outsourced infobox template."
                      .format(article.title()))
            else:
                box_page_match = BOX_TEMPLATE.match(template.name.strip())
                if box_page_match:
                    # read box template
                    box_page = page.Page(site, box_page_match.group(1))
                    for box_template in mwparserfromhell.parse(
                            box_page.text).filter_templates():
                        box_template_name = box_template.name.lower().strip()
                        if box_template_name.find(
                                "infobox/part") >= 0 or box_template_name.find(
                                    "partbox") >= 0:
                            if box_template_name.find("partbox") >= 0:
                                print("NOTE: Page '{}' uses Partbox".format(
                                    box_page.title()))
                            is_part = True
                            break
        # ONLY handle part pages (not pages like Part)
        if not is_part and must_be_part:
            print("NOTE: Skipped '{}' because it is not a part page.".format(
                article.title()))
            continue
        if is_english:
            for heading in parsed.filter_headings():
                old = heading.title
                if heading.title.strip() != new_description and is_description(
                        heading.title):
                    heading.title = new_description
                if old != heading.title:
                    _minc(comment, "desc title")
        # only read == .. == sections and not omit the first section
        sections = parsed.get_sections(levels=[2], include_lead=True)
        if reorder_sections:
            # quick and dirty workaround to have the footer in the last section
            last_section = unicode(sections[-1])
            footer_start = last_section.find("{{Parts}}")
            if last_section[footer_start - 1] == "\n" and last_section[
                    footer_start - 2] == "\n":
                while last_section[footer_start - 3] == "\n":
                    footer_start -= 1
            if footer_start < 0:
                print("ERROR: No {{Parts}} in last section")
            else:
                footer_less = last_section[:footer_start]
                footer = last_section[footer_start:]
                sections[-1] = mwparserfromhell.parse(footer_less)
            # the next line is not part of the workaround
            sorted_sections = sorted(sections, key=get_order)
            if footer_start >= 0:
                last_section = unicode(sorted_sections[-1])
                if last_section[-1] != "\n":
                    last_section += "\n"
                last_section += footer
                sorted_sections[-1] = mwparserfromhell.parse(last_section)
            # workaround ends here
            for i in range(0, len(sections)):
                if i >= len(sorted_sections) or get_heading(
                        sections[i]) != get_heading(sorted_sections[i]):
                    comments += ["*sorted order of sections;"]
                    break
            sections = sorted_sections
        new_text = ""
        for section in sections:
            heading = get_heading(section)
            if heading:
                heading.title = " {} ".format(heading.title.strip())
                if is_description(heading.title):
                    for template in section.filter_templates():
                        if template.name.lower().strip() == "quote":
                            author = None
                            for param in template.params:
                                if param.name == "2":
                                    author = param
                                elif param.name not in quote_parameter:
                                    break
                            else:
                                if author:
                                    template.remove(author)
                                    comments += [
                                        "-removed author for description;"
                                    ]
                            break  # only change the first quote template!
            if reorder_sections:
                new_text += "{}".format(section)
        if not reorder_sections:
            new_text = "{}".format(parsed)
        if comment["desc title"]:
            comments += [
                "*updated {} description title{};".format(
                    *_plural(comment["desc title"]))
            ]
        if comment["desc to quote"]:
            comments += [
                "*replaced {0} description template{1} with quote template{1};"
                .format(*_plural(comment["desc to quote"]))
            ]
        if comments:
            pywikibot.showDiff(article.text, new_text)
            article.text = new_text
            comment = " ".join(comments)
            if not apply_all:
                answer = pywikibot.inputChoice(
                    "Save {}?".format(article.title()),
                    ["Yes", "No", "All"], ["Y", "N", "A"], "N")
                if answer == "a":
                    apply_all = True
                else:
                    apply_now = answer == "y"
            if apply_all or apply_now:
                article.save(comment=comment)
            else:
                print("Skipping...")
            if pywikibot.config.simulate:
                print("Summary: {}".format(comment))
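
# Hedged usage sketch (the page generator below is hypothetical; any
# iterable of pywikibot pages works):
#
#   site = pywikibot.Site()
#   pages = pagegenerators.CategorizedPageGenerator(
#       pywikibot.Category(site, "Category:Parts"))
#   scan_list(pages, "Product description", reorder_sections=True,
#             must_be_part=True)
#
# must_be_part=True skips pages whose infobox does not mark them as parts.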
def upload(g): print("upload") print(g["WIKI"]) p = page.Page(COMMONS, TEST_NAME) p.put(g["WIKI"] + p.text)