Example #1
def doFillAbbrevs(scrapeLimit: Optional[int] = None) -> None:
    """Fill empty abbreviations in some automatizable cases.

    Currently the cases are:
    * abbreviation is equal to title, possibly without articles (a/the)
    """
    catName = 'Category:Infobox journals with missing ISO 4 abbreviations'
    cat = pywikibot.Category(Site(), catName)
    articles = cat.articles(namespaces=0, total=scrapeLimit, content=True)
    for n, page in enumerate(articles):
        print(f'--Scraping:\t{n}:\t[[{page.title()}]]', flush=True)
        for i, infobox in enumerate(getInfoboxJournals(page)):
            if infobox.get('abbreviation', '') != '':
                print('--Skipping infobox that actually has non-empty abbrev')
                continue
            title = abbrevUtils.stripTitle(page.title())
            if 'title' in infobox and infobox['title'] != title:
                print('--Skipping infobox with different title than article',
                      infobox['title'])
                continue
            cLang = abbrevUtils.getLanguage(infobox)
            cAbbrev = state.tryGetAbbrev(title, cLang)
            if cAbbrev is None:
                continue
            # If abbreviation is equal to title, up to "a/the" articles:
            if cAbbrev == re.sub(r'(The|the|A|a)\s+', '', title):
                print(f'--Filling "{title}" with abbrev "{cAbbrev}"')
                trySaving(page,
                          fillAbbreviation(page.text, i, cAbbrev),
                          'Filling trivial ISO-4 abbreviation. ',
                          overwrite=True)
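
The "trivial abbreviation" test above is a single regex comparison and can be
checked in isolation. A minimal sketch (the helper isTrivialAbbrev is invented
for illustration, not part of the bot):

import re

def isTrivialAbbrev(title: str, abbrev: str) -> bool:
    # Mirrors the check in doFillAbbrevs: remove every "a"/"the" article
    # (with its trailing whitespace) from the title, then compare.
    return abbrev == re.sub(r'(The|the|A|a)\s+', '', title)

assert isTrivialAbbrev('The Journal of Foo', 'Journal of Foo')
assert not isTrivialAbbrev('Journal of Foo', 'J. Foo')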
Example #2
def addOmicsHatnote(aTitle: str, title: str, publisher: str) -> None:
    """Add hatnote to [[aTitle]] about confusion risk with OMICS [[title]]."""
    page = pywikibot.Page(Site(), aTitle)
    if '{{Confused|' in page.text or '{{confused|' in page.text:
        print(f'Skip: {{{{confused}}}} hatnote already on [[{aTitle}]]')
        return
    print(f'Adding hatnote to [[{aTitle}]]')
    hatnote = (f'{{{{Confused|text=[[{title}]],'
               f' published by the [[{publisher}]]}}}}\n')
    trySaving(page, hatnote + page.text, overwrite=True, limitType='hatnote',
              summary='Add hatnote to predatory journal clone.')
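
The quadruple braces in the f-string above render as literal double braces in
the generated wikitext. A quick replay with invented values:

title = 'Journal of Foo'
publisher = 'Foo Publishing Group'
hatnote = (f'{{{{Confused|text=[[{title}]],'
           f' published by the [[{publisher}]]}}}}\n')
assert hatnote == ('{{Confused|text=[[Journal of Foo]],'
                   ' published by the [[Foo Publishing Group]]}}\n')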
Example #3
def makeAmpersandRedirects(pageTitle: str,
                           foreign: Set[str],
                           targetPageTitle: Optional[str] = None,
                           andToAmpersand: bool = True,
                           ampersandToAnd: bool = True) -> bool:
    """If pageTitle contains 'and'/'&', try creating redirect from '&'/'and'.

    `foreign` is a set of foreign-language titles to avoid.
    Return whether any edits were made.
    """
    if len(pageTitle) > 95:
        print('Skipping (length): ', pageTitle)
        return False
    if not targetPageTitle:
        targetPageTitle = pageTitle
    rTitle = ''
    if ' and ' in pageTitle and andToAmpersand:
        rTitle = pageTitle.replace(' and ', ' & ')
        rTitle = rTitle.replace(', & ', ' & ')
    if ' & ' in pageTitle and ampersandToAnd:
        rTitle = pageTitle.replace(' & ', ' and ')
        # Exclude possibly-foreign titles based on categories and
        # on language detection.
        if pageTitle in foreign:
            print('Skipping (lang category): ', pageTitle)
            return False
        if not EnglishWordList.check(pageTitle):
            isReliable, _, details = \
                pycld2.detect(pageTitle, isPlainText=True)
            if not isReliable or details[0][0] != 'ENGLISH':
                print('Skipping (lang detect): ', pageTitle)
                print(isReliable, str(details))
                return False
    if not rTitle:
        return False
    # Try creating a redirect from rTitle to pageTitle.
    rPage = pywikibot.Page(Site(), rTitle)
    # Skip if the page already exists.
    if rPage.exists():
        print('Skipping (already exists): ', rTitle)
        return False
    # Create the redirect.
    print(f'Creating redirect from [[{rTitle}]] to [[{targetPageTitle}]]')
    rNewContent = (f'#REDIRECT [[{targetPageTitle}]]\n'
                   f'{{{{R from modification}}}}\n')
    summary = 'Redirect between ampersand/and variant.'
    return trySaving(rPage, rNewContent, summary, overwrite=False)
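
The title rewriting at the top of makeAmpersandRedirects is pure string work;
this sketch replays the 'and' -> '&' chain on an invented title to show why
the second replace exists:

pageTitle = 'Journal of Law, Medicine, and Ethics'
rTitle = pageTitle.replace(' and ', ' & ')
# An Oxford comma leaves ', & ', which the second replace collapses.
assert rTitle == 'Journal of Law, Medicine, & Ethics'
rTitle = rTitle.replace(', & ', ' & ')
assert rTitle == 'Journal of Law, Medicine & Ethics'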
Example #4
def makeVariantRedirect(vTitle: str, targetArticle: str) -> bool:
    """Try creating a redirect from vTitle to targetArticle."""
    rPage = pywikibot.Page(Site(), vTitle)
    # Skip if the page already exists.
    if rPage.exists():
        print('Skipping variant (already exists): ', vTitle)
        return False
    # Create the redirect.
    print(f'Creating redirect from [[{vTitle}]] to [[{targetArticle}]]')

    # Check number of results in Google search: only possible for <100 requests.
    # sleepTime = 15
    # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    #            'AppleWebKit/537.36 (KHTML, like Gecko) '
    #            'Chrome/60.0.3112.113 Safari/537.36'}
    # url = 'https://www.google.com/search?'
    # url += urllib.parse.urlencode({'q': '"' + vTitle + '"'})
    # while True:
    #     try:
    #         sleep(sleepTime)
    #         req = urllib.request.Request(url, headers=headers)
    #         with urllib.request.urlopen(req) as response:
    #             html = str(response.read())
    #             if 'No results found' in html:
    #                 print('No Results')
    #                 return False
    #             regex = r'([0-9]+),?\s*([0-9]+),?\s*([0-9]*)\s*results'
    #             m = re.search(regex, html)
    #             if not m:
    #                 print('No results')
    #                 return False
    #             res = m.group(1) + m.group(2) + m.group(3)
    #             print('Results=', res)
    #             if int(res) < 5:
    #                 return False
    #             break
    #     except urllib.error.URLError as err:
    #         print('Exception: ', sys.exc_info()[0], '\n', err.reason)
    #         sleepTime *= 2
    #         print('sleep=', sleepTime, flush=True)

    rNewContent = (f'#REDIRECT [[{targetArticle}]]\n'
                   '{{R from abbreviation}}\n')
    summary = 'Redirect from variant abbreviation.'
    return utils.trySaving(rPage, rNewContent, summary, overwrite=False)
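
The disabled Google-counting block above parses result counts with a regex; a
standalone replay on an invented snippet of result-page text:

import re

regex = r'([0-9]+),?\s*([0-9]+),?\s*([0-9]*)\s*results'
html = 'About 1,234,567 results (0.41 seconds)'
m = re.search(regex, html)
assert m is not None
# The groups capture the comma-separated digit blocks, which the original
# code concatenates back into a single number.
assert m.group(1) + m.group(2) + m.group(3) == '1234567'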
Example #5
def fixPageRedirects(page: pywikibot.Page) -> int:
    """Fix redirects to given page."""
    title = page.title()
    pageData = state.getPageData(title)
    (requiredRedirects, skip) = getRequiredRedirects(page)
    nEditedPages = 0
    for rTitle, rCats in requiredRedirects.items():
        rNewContent = rcatSetToRedirectContent(title, rCats)
        # Attempt to create new redirect.
        if rTitle not in pageData['redirects']:
            try:
                exists = pywikibot.Page(Site(), rTitle).exists()
            except pywikibot.exceptions.InvalidTitle:
                exists = False
            if exists:
                print(f'--Skipping existing page [[{rTitle}]] '
                      f'(not a redirect to [[{title}]]).')
                if title == rTitle:
                    continue
                if title not in pywikibot.Page(Site(), rTitle).text:
                    reports.reportExistingOtherPage(title, rTitle)
            else:
                print(f'--Creating redirect '
                      f'from [[{rTitle}]] to [[{title}]]. '
                      f'Created content:\n{rNewContent}\n-----',
                      flush=True)
                nEditedPages += 1
                rPage = pywikibot.Page(Site(), rTitle)
                trySaving(rPage, rNewContent,
                          'Creating redirect from standard abbreviation. ',
                          overwrite=False)
        else:
            rOldContent = pageData['redirects'][rTitle]
            if isValidISO4Redirect(rOldContent, title, rCats):
                print(f'--Skipping existing valid redirect '
                      f'from [[{rTitle}]] to [[{title}]].')
            elif isReplaceableRedirect(rOldContent, title,
                                       rCats | RCatSet.ISO4):
                # Don't log or edit redirects that would be replaceable,
                # except when they are marked ISO4 and we're not sure
                # they should be.
                if not (rCats & RCatSet.ISO4):
                    continue
                print(f'--Replacing existing redirect '
                      f'from [[{rTitle}]] to [[{title}]].\n'
                      f'RCatSet: {rCats}\n'
                      f'Original content:\n{rOldContent}\n----- '
                      f'New content:\n{rNewContent}\n-----',
                      flush=True)
                nEditedPages += 1
                rPage = pywikibot.Page(Site(), rTitle)
                trySaving(rPage, rNewContent,
                          'Marking standard abbrev rcat. ',
                          overwrite=True)
            elif not skip:
                print(f'--Skipping existing dubious redirect '
                      f'from [[{rTitle}]] to [[{title}]].\n'
                      f'RCatSet: {rCats}\n'
                      f'Original content:\n{rOldContent}\n----- ')
                reports.reportExistingOtherRedirect(title, rTitle, rOldContent)
    # Purge page cache to remove warnings about missing redirects.
    if nEditedPages > 0:
        tryPurging(page)

    # Report redirects that we wouldn't add, but exist and are marked as ISO-4.
    if requiredRedirects and not skip:
        expectedAbbrevs = \
            [r.replace('.', '') for r in requiredRedirects]
        potentialAbbrevs = []
        for rTitle, rContent in pageData['redirects'].items():
            if 'from former name' in rContent or '.' not in rTitle:
                cAbbrevEng = state.tryGetAbbrev(
                    abbrevUtils.stripTitle(rTitle), 'eng') or ''
                cAbbrevAll = state.tryGetAbbrev(
                    abbrevUtils.stripTitle(rTitle), 'all') or ''
                cAbbrevEng = cAbbrevEng.replace('.', '')
                cAbbrevAll = cAbbrevAll.replace('.', '')
                if 'from former name' in rContent:
                    if cAbbrevEng != rTitle.replace('.', ''):
                        expectedAbbrevs.append(cAbbrevEng)
                    if cAbbrevAll != rTitle.replace('.', ''):
                        expectedAbbrevs.append(cAbbrevAll)
                elif '.' not in rTitle:
                    if cAbbrevEng != rTitle.replace('.', ''):
                        potentialAbbrevs.append((cAbbrevEng, rTitle))
                    if cAbbrevAll != rTitle.replace('.', ''):
                        potentialAbbrevs.append((cAbbrevAll, rTitle))
        expectedAbbrevs = [a for a in expectedAbbrevs if a]
        potentialAbbrevs = [(a, t) for (a, t) in potentialAbbrevs if a]
        for rTitle, rContent in pageData['redirects'].items():
            if not re.search(r'R from ISO 4', rContent):
                continue
            # Ignore rTitle that contain a computed abbreviation as a
            # substring, assume that it's some valid variation on a subtitle.
            isExpected = False
            rTitleDotless = rTitle.replace('.', '')
            for computedAbbrev in expectedAbbrevs:
                if re.sub(r'\s*[:(].*', '', computedAbbrev) in rTitleDotless:
                    isExpected = True
                    break
            if not isExpected:
                # Find other titles in existing redirects
                # that would ISO-4 abbreviate to it
                potentials = [t for (a, t) in potentialAbbrevs
                              if abbrevUtils.isSoftMatch(rTitleDotless, a)]
                potentials = list(sorted(set(potentials)))
                # Find closest computed abbrev.
                bestAbbrev = ''
                bestDist = len(rTitle)
                for computedAbbrev in sorted(requiredRedirects):
                    dist = Levenshtein.distance(rTitle, computedAbbrev)
                    if dist < bestDist:
                        bestDist = dist
                        bestAbbrev = computedAbbrev
                # Skip if closest abbrev. is far (assume it's from a former
                # title, since there's a ton of cases like that).
                if bestDist <= 8:
                    reports.reportSuperfluousRedirect(
                        title, rTitle, rContent, bestAbbrev, potentials)
    return nEditedPages
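
The "closest computed abbreviation" step near the end of fixPageRedirects is a
plain minimum over Levenshtein distances. A self-contained sketch with invented
candidates (python-Levenshtein assumed available, as in the code above):

import Levenshtein

rTitle = 'J. Foo Res.'
candidates = ['Foo Res. Lett.', 'J. Bar Stud.', 'J. Foo Res.']
bestAbbrev, bestDist = '', len(rTitle)
for computedAbbrev in sorted(candidates):
    dist = Levenshtein.distance(rTitle, computedAbbrev)
    if dist < bestDist:
        bestDist, bestAbbrev = dist, computedAbbrev
# As in the code above, a distance beyond 8 would be treated as
# "probably from a former title" and not reported.
assert (bestAbbrev, bestDist) == ('J. Foo Res.', 0)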
Example #6
def fixRedirectAnchor(rTitle: str, anchor: str, target: str) -> bool:
    """Add an anchor to given redirect page."""
    rPage = pywikibot.Page(Site(), rTitle)
    addJournal = False
    if rPage.exists() and not rPage.isRedirectPage():
        addJournal = True
        if 'journal' in rTitle.lower():
            print(f'Skip: [[{rTitle}]] already exists, '
                  'title already has "journal".', flush=True)
            return False
        for cat in rPage.categories():
            if 'journal' in cat.title().lower():
                print(f'Skip: [[{rTitle}]] already exists, '
                      'has category containing "journal".', flush=True)
                return False
    if addJournal:
        rPage = pywikibot.Page(Site(), rTitle + ' (journal)')
    if not rPage.exists() or not rPage.isRedirectPage():
        print(f'Not exists/not a redirect: [[{rPage.title()}]]', flush=True)
        return False
    # Page.title() actually contains anchor, if redirect had one.
    actualTarget = rPage.getRedirectTarget().title().split('#', 1)
    if actualTarget[0] != target:
        print(f'Not a redirect to this list: '
              f'[[{rPage.title()}]] -> [[{actualTarget[0]}]]', flush=True)
        return False
    if len(actualTarget) > 1:
        if actualTarget[1] != anchor:
            print(f'WARNING: Anchor mismatch: '
                  f'[[{rPage.title()}]] -> [[{actualTarget[0]}]]. '
                  f'Is "{actualTarget[1]}", should be "{anchor}".')
            return False
        else:
            return True
    predictedAnchor = getPredictedAnchor(rTitle)
    if predictedAnchor != anchor:
        print(f'WARNING: Anchor mismatch: '
              f'[[{rPage.title()}]] -> [[{actualTarget[0]}]]. '
              f'Predicted "{predictedAnchor}", should be "{anchor}".')
        return False

    rText = rPage.text
    rNewText = re.sub(r'''(
                              \#\s*REDIRECT\s*\[\[
                              [^\]\#]+             # title
                          )
                          (\#[^\]]*)?              # anchor
                          \]\]''',
                      '\\1#' + anchor + ']]',
                      rText, count=1, flags=re.VERBOSE)
    if rText == rNewText:
        print(f'Nothing to do on: [[{rPage.title()}]]')
        return True
    print(f'===CHANGING [[{rPage.title()}]] FROM==================')
    print(rText)
    print('==========TO===========')
    print(rNewText + '\n\n', flush=True)
    trySaving(rPage, rNewText,
              'Add anchor to redirect, as it points to a long list.',
              overwrite=True)
    return True
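
The verbose regex in fixRedirectAnchor only rewrites the first #REDIRECT link
and preserves everything after it; a standalone replay on a minimal redirect
body (target title invented):

import re

rText = '#REDIRECT [[List of foo journals]]\n{{R from ISO 4}}\n'
anchor = 'K'
rNewText = re.sub(r'''(
                          \#\s*REDIRECT\s*\[\[
                          [^\]\#]+             # title
                      )
                      (\#[^\]]*)?              # anchor
                      \]\]''',
                  '\\1#' + anchor + ']]',
                  rText, count=1, flags=re.VERBOSE)
assert rNewText == '#REDIRECT [[List of foo journals#K]]\n{{R from ISO 4}}\n'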
Example #7
def createOrFixOmicsRedirect(title: str, rType: str,
                             config: Config, tryOnly: bool) -> str:
    """Attempt to create or fix redirect from [[title]] to [[target]].

    We return 'create' if non-existing, 'done' if basically equal to what we
    would add, 'fix' if exists but looks fixable, 'unfixable' otherwise.
    Also create talk page with {{WPJournals}} when non-existing.
    """
    rText = '#REDIRECT[[' + config.rTarget + ']]\n'
    rCat = '[[Category:' + config.rCat + ']]\n' if config.rCat else ''
    rIsoCat = '{{R from ISO 4}}\n'
    rSortTitle = title
    if rSortTitle.startswith('The ') and '(' not in title:
        rSortTitle = rSortTitle.replace('The ', '') + ', The'
    if ' & ' in rSortTitle:
        rSortTitle = rSortTitle.replace(' & ', ' and ')
    if rSortTitle != title:
        rSort = '{{DEFAULTSORT:' + rSortTitle + '}}\n'
    if config.anchor:
        rText = '#REDIRECT[[' + config.rTarget + '#' + rSortTitle[0] + ']]\n'

    rNewContent = rText
    if rSortTitle != title:
        rNewContent += rSort
    if rType == 'plain':
        rNewContent += rCat
    if rType == 'iso4':
        rNewContent += '{{R from ISO 4}}\n'

    rPage = pywikibot.Page(Site(), title)
    rTalkPage = rPage.toggleTalkPage()
    if not rPage.exists():
        if rType == 'uniso4':
            return 'ignore'
        if not tryOnly:
            print(f'Creating redirect from: [[{title}]].')
            trySaving(rPage, rNewContent,
                      'Create redirect from journal to publisher.',
                      overwrite=False, limitType='create')
            if rType == 'plain' and not rTalkPage.exists():
                content = '{{WPJournals|class=redirect}}'
                trySaving(rTalkPage, content,
                          'Mark new redirect into {{WPJournals}}.',
                          overwrite=False, limitType='talk')
        return 'create'
    # If rPage exists, check if we would add basically the same.
    text = rPage.text
    # Note: re.sub's fourth positional argument is `count`, so flags
    # must be passed by keyword.
    textStripped = re.sub(r'\s', '', text, flags=re.M).strip()
    rNewStripped = re.sub(r'\s', '', rNewContent, flags=re.M).strip()
    if textStripped == rNewStripped:
        if not tryOnly:
            if rTalkPage.exists():
                print(f'Done: [[{title}]].')
            elif rType == 'plain':
                print(f'Done, but creating talk page: [[{title}]].')
                content = '{{WPJournals|class=redirect}}'
                trySaving(rTalkPage, content,
                          'Mark redirect into {{WPJournals}}.',
                          overwrite=False, limitType='talk')
        return 'done'
    # If rPage exists but not the same, check if it is a fixable case.
    if rCat:
        text = text.replace(rCat.strip(), '')
    text = text.replace(rIsoCat.strip(), '')
    text = re.sub(r'\{\{DEFAULTSORT:[^\}]*\}\}', '', text)
    # Strip link anchors and whitespace before comparing
    regex = r'(' + re.escape(config.rTarget) + r')\#.'
    textStripped = re.sub(regex, r'\1', text, flags=re.M)
    textStripped = re.sub(r'\s', '', textStripped, flags=re.M).strip()
    rTextStripped = re.sub(regex, r'\1', rText, flags=re.M)
    rTextStripped = re.sub(r'\s', '', rTextStripped, flags=re.M).strip()
    if textStripped != rTextStripped:
        print(f'Not fixable: [[{title}]]  (type={rType}).')
        print('---IS-------------')
        print(rPage.text)
        print('---SHOULD BE------')
        print(rNewContent)
        print('==================')
        return 'unfixable'
    # If it is fixable, fix it.
    if not tryOnly:
        if rType == 'uniso4':
            print(f'Removing iso4 tag from: [[{title}]].')
        print(f'Fixing redirect from: [[{title}]] (type={rType}).')
        print('---WAS------------')
        print(rPage.text)
        print('---WILL BE--------')
        print(rNewContent)
        print('==================')
        trySaving(rPage, rNewContent,
                  'Fix redirect from journal to publisher.',
                  overwrite=True, limitType='fix')
        if rType == 'plain' and not rTalkPage.exists():
            content = '{{WPJournals|class=redirect}}'
            trySaving(rTalkPage, content,
                      'Fix redirect from journal to publisher.',
                      overwrite=False, limitType='talk')
    return 'fix'
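
The sort-key normalization at the top of createOrFixOmicsRedirect is the only
string munging before the wikitext is assembled; a quick replay with an
invented title shows the value that ends up in {{DEFAULTSORT:...}}:

title = 'The Journal of Foo & Bar'
rSortTitle = title
if rSortTitle.startswith('The ') and '(' not in title:
    # Move the article to the end, catalog-style (note that replace()
    # drops every 'The ' occurrence, as in the original).
    rSortTitle = rSortTitle.replace('The ', '') + ', The'
if ' & ' in rSortTitle:
    rSortTitle = rSortTitle.replace(' & ', ' and ')
assert rSortTitle == 'Journal of Foo and Bar, The'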