Example #1
0
def harvest(thesaurus):
    """Record the sourced words linked from one thesaurus page.

    The page title is ``thesaurusPrefix + thesaurus + fr`` on WIKTIONNAIRE.
    When the thesaurus is not yet in ``cache["thesaurus"]`` and the page
    exists, every link found in the page text is passed to ``sources()`` and
    the words for which it returns a truthy value are appended to
    ``cache["thesaurus"][thesaurus]``.  ``write()`` is called in every case.
    """
    # Test the cache first so an already harvested thesaurus costs no lookup.
    if thesaurus not in cache["thesaurus"]:
        p = page.Page(WIKTIONNAIRE, thesaurusPrefix + thesaurus + fr)
        # exists must be *called*: the bare bound method is always truthy.
        if p.exists():
            print(thesaurus.upper())
            cache["thesaurus"][thesaurus] = []
            # Each match loses its first two and last two characters
            # (presumably the "[[" and "]]" of a wiki link -- confirm against
            # the ``link`` pattern).
            wikiWords = [x[2:-2] for x in link.findall(p.text)]
            for wikiWord in wikiWords:
                if sources(wikiWord):
                    cache["thesaurus"][thesaurus].append(wikiWord)
    write()
Example #2
0
def harvest(thesaurus):
    """Harvest information about a given Thesaurus and save cache.

    The page title is ``thesaurusPrefix + thesaurus + fr`` on WIKTIONNAIRE.
    When the thesaurus is not yet in ``CACHE["thesaurus"]`` and the page
    exists, every link found in the page text is passed to ``sources()`` and
    the words for which it returns a truthy value are appended to
    ``CACHE["thesaurus"][thesaurus]``.  ``cache.save(CACHE)`` is called in
    every case.
    """
    LOG.info("Harvesting: %s", thesaurus)
    # Test the cache first so an already harvested thesaurus costs no lookup.
    if thesaurus not in CACHE["thesaurus"]:
        p = page.Page(WIKTIONNAIRE, thesaurusPrefix + thesaurus + fr)
        # exists must be *called*: the bare bound method is always truthy.
        if p.exists():
            LOG.debug(thesaurus.upper())
            CACHE["thesaurus"][thesaurus] = []
            # Each match loses its first two and last two characters
            # (presumably the "[[" and "]]" of a wiki link -- confirm against
            # the ``link`` pattern).
            wikiWords = [x[2:-2] for x in link.findall(p.text)]
            for wikiWord in wikiWords:
                if sources(wikiWord):
                    CACHE["thesaurus"][thesaurus].append(wikiWord)
    cache.save(CACHE)
Example #3
0
def update():
    """Apply the submitted clusters and render the result page.

    Reads the ``data`` request argument, parses it with
    ``ast.literal_eval`` and, for every cluster whose id is not
    ``"unclustered"`` and that has at least one image, maps the image ids
    through ``idMap`` and hands the matching COMMONS pages to
    ``fusion_cat``.

    Returns whatever ``render_template('result.html', **result)`` produces.
    """
    LOG.info("updating Wikimedia projects")
    data = request.args.get('data', 0, type=str)
    # literal_eval only accepts Python literals, so the payload is not executed.
    d = ast.literal_eval(data)
    for cluster in d:
        # Compare by value: ``is not`` against a string literal tests object
        # identity, which is not guaranteed for a freshly parsed string.
        if cluster["id"] != "unclustered" and cluster["images"]:
            cluster["images"] = [idMap[img] for img in cluster["images"]]
            fusion_cat([page.Page(COMMONS, img) for img in cluster["images"]])
    # NOTE(review): ``result`` is not assigned in this function; it has to be
    # a module-level mapping -- confirm.
    return render_template('result.html', **result)
Example #4
0
def gender(author):
    """Collect the ids of the gender claim targets for an author.

    Looks up ``author`` on WIKIPEDIA and, when the article exists, reads the
    claims of its data item.

    Returns a dict shaped ``{author: {genders: [target_id, ...]}}``; the list
    is empty when the article does not exist or no claim matched.
    """
    result = []
    wikiArticle = page.Page(WIKIPEDIA, author)
    if wikiArticle.exists():
        item = wikiArticle.data_item()
        # NOTE(review): unless it is rebound after this definition, ``gender``
        # here is this very function, not a property id, so the membership
        # test below cannot match a claim key.  The intended property
        # constant needs a distinct name -- confirm with the module globals.
        if gender in item.claims:
            for claim in item.claims[gender]:
                target = claim.getTarget()
                if target is not None:
                    # ``result`` is a list: append (the former ``.add`` call
                    # raised AttributeError).
                    result.append(target.id)
    return {author: {genders: result}}
Example #5
0
def categories(p, height=0):
    """Return the set of category titles above ``p``.

    The direct parents of ``p`` are read from ``categories_tree[p]["Parents"]``
    and fetched from COMMONS (then stored in ``categories_tree``) when that
    entry is missing.

    Args:
        p: title of the page or category to start from.
        height: number of additional levels to climb; ``0`` returns the
            direct parents only.

    Returns:
        A new ``set`` with the parents and, for a non-zero ``height``, the
        result of the recursive calls on each parent with ``height - 1``.
    """
    if p not in categories_tree or "Parents" not in categories_tree[p]:
        categories_tree[p] = {
            "Parents": [c.title() for c in page.Page(COMMONS, p).categories()]
        }
    cats = set(categories_tree[p]["Parents"])
    # Value comparison: ``is 0`` relied on small-int caching.
    if height == 0:
        return cats
    ancestors = set(cats)
    for cat in cats:
        ancestors |= categories(cat, height - 1)
    return ancestors
Example #6
0
def visualize(category_name, clusters):
    """Append one gallery per cluster to the cluster overview page.

    The current text of "User:Donna Nobot/clusterArtworks" on COMMONS is
    extended with a section heading linking to ``category_name`` followed by
    a packed ``<gallery>`` block for every entry of ``clusters``; each item of
    a cluster becomes one line of its gallery.  The combined text is written
    back with ``put``.
    """
    target = page.Page(COMMONS, "User:Donna Nobot/clusterArtworks")
    pieces = [
        target.text,
        "\n\n== [[:Category:",
        category_name,
        "|",
        category_name,
        "]] ==\n",
    ]
    for members in clusters:
        pieces.append("<gallery mode=\"packed\">\n")
        for entry in members:
            pieces.extend((entry, "\n"))
        pieces.append("</gallery>\n\n")
    new_text = "".join(pieces)
    target.put(new_text, "#clusterArworks")
Example #7
0
def characteristics(author):
    """Collect the claim values listed in ``datas`` for an author.

    Looks up ``author`` on WIKIPEDIA, follows redirects, and reads the claims
    of the article's data item.

    Returns a dict keyed by the entries of ``datas`` that have at least one
    claim with a target: the target's ``year`` for ``birthDate``, its ``id``
    otherwise.  When several claims have a target the last one wins.  The
    dict is empty when the article does not exist.
    """
    print("\t" + author)
    result = {}
    wikiArticle = page.Page(WIKIPEDIA, author)
    if not wikiArticle.exists():
        return result
    while wikiArticle.isRedirectPage():
        wikiArticle = wikiArticle.getRedirectTarget()
    item = wikiArticle.data_item()
    for data in datas:
        if data not in item.claims:
            continue
        for claim in item.claims[data]:
            target = claim.getTarget()
            if target is None:
                continue
            # Compare by value; identity only held while ``datas`` contained
            # the very same object as ``birthDate``.
            if data == birthDate:
                result[data] = target.year
            else:
                result[data] = target.id
    return result
Example #8
0
def sources(word):
    """Collect the authors cited in the source templates of a word.

    Reads the text of the WIKTIONNAIRE page for ``word``, extracts the source
    templates with ``source`` and the author links inside each template with
    ``wAuthor``.  Every new author is looked up with ``gender()`` and the
    outcome is stored in, or merged into, ``cache["authors"]``.

    Returns ``{word: {"authors": [author, ...]}}`` with each author listed
    once, in order of first appearance.
    """
    # NOTE(review): the original literal spelled this key 'Authors' while every
    # access used "authors"; the lower-case form is kept to match the accesses
    # and the ``cache["authors"]`` key -- confirm with callers.
    result = {
        "authors": []
    }
    text = page.Page(WIKTIONNAIRE, word).text
    templates = source.findall(text)
    for template in templates:
        # Authors are linked to wikipedia
        wikiAuthors = wAuthor.findall(template)
        for wikiAuthor in wikiAuthors:
            if wikiAuthor not in result["authors"]:
                result["authors"].append(wikiAuthor)
                genders = gender(wikiAuthor)
                if wikiAuthor not in cache["authors"]:
                    cache["authors"][wikiAuthor] = genders
                else:
                    # gender() returns a dict, so merge with dict.update.
                    cache["authors"][wikiAuthor].update(genders)
    return {word: result}
Example #9
0
def characteristics(author):
    """Find characteristics for an author on Wikidata.

    Looks up ``author`` on WIKIPEDIA, follows redirects, and reads the claims
    of the article's data item.

    Returns a dict keyed by the entries of ``datas`` that have at least one
    claim with a target: the target's ``year`` for ``birthDate``, its ``id``
    otherwise.  When several claims have a target the last one wins.  The
    dict is empty when the article does not exist.
    """
    LOG.debug("\t%s", author)
    result = {}
    wikiArticle = page.Page(WIKIPEDIA, author)
    if not wikiArticle.exists():
        return result
    while wikiArticle.isRedirectPage():
        wikiArticle = wikiArticle.getRedirectTarget()
    item = wikiArticle.data_item()
    for data in datas:
        if data not in item.claims:
            continue
        for claim in item.claims[data]:
            target = claim.getTarget()
            if target is None:
                continue
            # Compare by value; identity only held while ``datas`` contained
            # the very same object as ``birthDate``.
            if data == birthDate:
                result[data] = target.year
            else:
                result[data] = target.id
    return result
Example #10
0
def sources(word):
    """Index the authors cited in the source templates of a word.

    When the WIKTIONNAIRE page for ``word`` exists, ``cache[words][word]`` is
    reset to an empty list and every author link found inside a source
    template is appended to it; the word is appended to the author's own
    ``words`` list as well.  Authors not yet in ``cache[authors]`` are first
    initialised from ``characteristics()``.

    Returns:
        True when the page exists, False otherwise.
    """
    print(word)
    p = page.Page(WIKTIONNAIRE, word)
    # exists must be *called*: the bare bound method is always truthy, which
    # made the False branch unreachable.
    if not p.exists():
        return False
    cache[words][word] = []
    for template in source.findall(p.text):
        # Authors are linked to wikipedia.  Each match loses its first four
        # and last two characters (presumably the link markup -- confirm
        # against the ``wAuthor`` pattern).
        wikiAuthors = [x[4:-2] for x in wAuthor.findall(template)]
        for wikiAuthor in wikiAuthors:
            if wikiAuthor not in cache[authors]:
                cache[authors][wikiAuthor] = characteristics(wikiAuthor)
                cache[authors][wikiAuthor][words] = []
            cache[words][word].append(wikiAuthor)
            cache[authors][wikiAuthor][words].append(word)
    return True
Example #11
0
def sources(word):
    """Find sources information for word.

    When the WIKTIONNAIRE page for ``word`` exists, ``CACHE[words][word]`` is
    reset to an empty list and every author found inside a source template
    (through ``wAuthor`` or ``nomWAuthor``) is appended to it; the word is
    appended to the author's own ``words`` list as well.  Authors not yet in
    ``CACHE[authors]`` are first initialised from ``characteristics()``.

    Returns:
        True when the page exists, False otherwise.
    """
    LOG.debug(word)
    p = page.Page(WIKTIONNAIRE, word)
    # exists must be *called*: the bare bound method is always truthy, which
    # made the False branch unreachable.
    if not p.exists():
        return False
    CACHE[words][word] = []
    for template in source.findall(p.text):
        # Authors are linked to wikipedia.  Each wAuthor match loses its first
        # four and last two characters (presumably the link markup -- confirm
        # against the pattern).
        wikiAuthors = [x[4:-2] for x in wAuthor.findall(template)]
        # nomWAuthor yields groups; the first two are joined with a space.
        wikiAuthors += [
            x[0] + " " + x[1] for x in nomWAuthor.findall(template)
        ]
        for wikiAuthor in wikiAuthors:
            if wikiAuthor not in CACHE[authors]:
                CACHE[authors][wikiAuthor] = characteristics(wikiAuthor)
                CACHE[authors][wikiAuthor][words] = []
            CACHE[words][word].append(wikiAuthor)
            CACHE[authors][wikiAuthor][words].append(word)
    return True
Example #12
0
def scan_list(pages, new_description, reorder_sections, must_be_part):
    """Normalise description sections on each page and offer to save it.

    For every page the function
      * renames ``description`` templates to ``Quote``,
      * on English pages, retitles description headings to
        ``new_description``,
      * optionally sorts the level-2 sections with ``get_order`` while keeping
        the ``{{Parts}}`` footer at the very end,
      * removes the parameter named "2" from the first eligible quote
        template of a description section,
    then shows a diff and asks whether to save (Yes / No / All).

    Args:
        pages: iterable of page objects providing ``title()``, ``text`` and
            ``save()``.
        new_description: heading title description sections should carry.
        reorder_sections: when truthy, rebuild the text from sorted sections.
        must_be_part: when truthy, skip pages not recognised as part pages.
    """
    apply_all = False
    for article in pages:
        is_english = not NOT_ENGLISH.match(article.title())
        quote_parameter = ENGLISH_QUOTE_PARAMETER if is_english else NOT_ENGLISH_QUOTE_PARAMETER
        print("======================================")
        print("Working on: {}".format(article.title()))
        parsed = mwparserfromhell.parse(article.text)
        # Per-page change counters, turned into summary fragments below.
        counters = {
            "desc title": 0,
            "desc to quote": 0,
        }
        comments = []
        is_part = False
        for template in parsed.filter_templates():
            template_name = template.name.lower().strip()
            if template_name == "description":
                template.name = "Quote"
                _minc(counters, "desc to quote")
            elif "infobox/part" in template_name or "partbox" in template_name:
                is_part = True
                # The placeholder was previously printed verbatim because the
                # format() call was missing.
                print(
                    "NOTE: Page '{}' does not use outsourced infobox template."
                    .format(article.title()))
            else:
                box_page_match = BOX_TEMPLATE.match(template.name.strip())
                if box_page_match:
                    # read box template
                    box_page = page.Page(site, box_page_match.group(1))
                    for box_template in mwparserfromhell.parse(
                            box_page.text).filter_templates():
                        box_template_name = box_template.name.lower().strip()
                        uses_partbox = "partbox" in box_template_name
                        if "infobox/part" in box_template_name or uses_partbox:
                            if uses_partbox:
                                print("NOTE: Page '{}' uses Partbox".format(
                                    box_page.title()))
                            is_part = True
                            break
        # ONLY handle part pages (not pages like Part)
        if not is_part and must_be_part:
            print("NOTE: Skipped '{}' because is not part page.".format(
                article.title()))
            continue
        if is_english:
            for heading in parsed.filter_headings():
                old = heading.title
                if heading.title.strip() != new_description and is_description(
                        heading.title):
                    heading.title = new_description
                    if old != heading.title:
                        _minc(counters, "desc title")
        # only read == .. == sections and not omit the first section
        sections = parsed.get_sections(levels=[2], include_lead=True)
        if reorder_sections:
            # quick and dirty workaround to have the footer in the last section
            last_section = unicode(sections[-1])
            footer_start = last_section.find("{{Parts}}")
            if footer_start < 0:
                print("ERROR: No {{Parts}} in last section")
            else:
                # Walk back over surplus blank lines so that exactly two
                # newlines stay in front of the footer.  The bounds checks
                # keep the indices from wrapping around to the end of the
                # string when the footer sits at the very start.
                if (footer_start >= 2
                        and last_section[footer_start - 1] == "\n"
                        and last_section[footer_start - 2] == "\n"):
                    while (footer_start >= 3
                           and last_section[footer_start - 3] == "\n"):
                        footer_start -= 1
                footer_less = last_section[:footer_start]
                footer = last_section[footer_start:]
                sections[-1] = mwparserfromhell.parse(footer_less)
            # the next line is not part of the workaround
            sorted_sections = sorted(sections, key=get_order)
            if footer_start >= 0:
                last_section = unicode(sorted_sections[-1])
                if not last_section.endswith("\n"):
                    last_section += "\n"
                last_section += footer
                sorted_sections[-1] = mwparserfromhell.parse(last_section)
            # workaround ends here
            # sorted() keeps the length, so a pairwise walk covers every
            # section; one differing heading means the order changed.
            for before, after in zip(sections, sorted_sections):
                if get_heading(before) != get_heading(after):
                    comments.append("*sorted order of sections;")
                    break
            sections = sorted_sections
            new_text = ""

        for section in sections:
            heading = get_heading(section)
            if heading:
                heading.title = " {} ".format(heading.title.strip())
                if is_description(heading.title):
                    for template in section.filter_templates():
                        if template.name.lower().strip() == "quote":
                            author = None
                            for param in template.params:
                                if param.name == "2":
                                    author = param
                                elif param.name not in quote_parameter:
                                    # Unknown parameter: leave this template
                                    # alone and try the next one.
                                    break
                            else:
                                if author:
                                    template.remove(author)
                                    comments.append(
                                        "-removed author for description;")
                                break  #only change the first quote template!
            if reorder_sections:
                new_text += "{}".format(section)
        if not reorder_sections:
            new_text = "{}".format(parsed)
        if counters["desc title"]:
            comments.append(
                "*updated {} description title{};".format(
                    *_plural(counters["desc title"])))
        if counters["desc to quote"]:
            comments.append(
                "*replaced {0} description template{1} with quote template{1};"
                .format(*_plural(counters["desc to quote"])))
        if comments:
            pywikibot.showDiff(article.text, new_text)
            article.text = new_text
            summary = " ".join(comments)
            if not apply_all:
                answer = pywikibot.inputChoice(
                    "Save {}?".format(article.title()), ["Yes", "No", "All"],
                    ["Y", "N", "A"], "N")
                if answer == "a":
                    apply_all = True
                else:
                    apply_now = answer == "y"
            # apply_now is only read when apply_all is False, in which case it
            # was assigned just above.
            if apply_all or apply_now:
                article.save(comment=summary)
            else:
                print("Skipping...")
            if pywikibot.config.simulate:
                print("Summary: {}".format(summary))
Example #13
0
def upload(g):
    """Prepend ``g["WIKI"]`` to the TEST_NAME page on COMMONS.

    Prints a short trace (the word "upload" and the text being added), then
    writes the new text followed by the page's current text back with
    ``put``.
    """
    print("upload")
    addition = g["WIKI"]
    print(addition)
    target = page.Page(COMMONS, TEST_NAME)
    combined = addition + target.text
    target.put(combined)