Example #1
def parse_all_fname_tags(fname):
    _tags = [splitext(fname)[0]]
    _tags = ut.flatten([t.split('_') for t in _tags])
    _tags = ut.flatten([t.split('.') for t in _tags])
    _tags = [t.lower() for t in _tags]
    _tags = [tag_alias_map.get(t, t) for t in _tags]
    for key, vals in regex_alias_map.items():
        pat = ut.regex_or(vals)
        _tags = [key if re.match(pat, t) else t for t in _tags]
    pat = ut.regex_or(invalid_tag_patterns)
    _tags = [t for t in _tags if not re.match(pat, t)]
    _tags = ut.unique_ordered(_tags)
    return _tags
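
The ut.* helpers and the alias tables above come from the surrounding module and are not shown here. A rough, self-contained sketch of the same pipeline, with plain-Python stand-ins for ut.flatten, ut.regex_or, and ut.unique_ordered and toy alias tables (all assumptions):

import re
from os.path import splitext

# Toy stand-ins for the module-level tables used above (assumptions).
tag_alias_map = {'lf': 'left'}
regex_alias_map = {'right': ['r', 'rt']}
invalid_tag_patterns = [r'\d+']

def parse_all_fname_tags(fname):
    base = splitext(fname)[0]
    # split on '_' and '.' and lowercase, like the ut.flatten calls above
    tags = [t.lower() for part in base.split('_') for t in part.split('.')]
    tags = [tag_alias_map.get(t, t) for t in tags]
    for key, vals in regex_alias_map.items():
        pat = '(?:' + '|'.join(vals) + ')'      # assumed ut.regex_or behavior
        tags = [key if re.match(pat, t) else t for t in tags]
    pat = '(?:' + '|'.join(invalid_tag_patterns) + ')'
    tags = [t for t in tags if not re.match(pat, t)]
    return list(dict.fromkeys(tags))            # order-preserving unique

print(parse_all_fname_tags('IMG_0001_lf.rt.jpg'))  # ['img', 'left', 'right']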
Example #2
def fix_authors(cleaner):
    # Fix Authors
    if 'author' in cleaner.entry:
        authors = six.text_type(cleaner.entry['author'])
        for truename, alias_list in constants_tex_fixes.AUTHOR_NAME_MAPS.items():
            pattern = six.text_type(
                ut.regex_or([
                    ut.util_regex.whole_word(alias) for alias in alias_list
                ]))
            authors = re.sub(pattern,
                             six.text_type(truename),
                             authors,
                             flags=re.UNICODE)
        cleaner.entry['author'] = authors
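
A minimal, self-contained illustration of the same alias-to-canonical-name substitution, assuming ut.util_regex.whole_word(x) behaves roughly like r'\b' + re.escape(x) + r'\b' and using a toy alias map in place of constants_tex_fixes.AUTHOR_NAME_MAPS:

import re

AUTHOR_NAME_MAPS = {'Jane Q. Doe': ['J. Doe', 'Jane Doe']}  # toy example

authors = 'John Smith and Jane Doe'
for truename, alias_list in AUTHOR_NAME_MAPS.items():
    # join whole-word patterns for every alias of this author
    pattern = '|'.join(r'\b' + re.escape(alias) + r'\b' for alias in alias_list)
    authors = re.sub(pattern, truename, authors, flags=re.UNICODE)
print(authors)  # John Smith and Jane Q. Doe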
Example #3
    def fix_capitalization(match):
        dict_ = match.groupdict()
        section_title = dict_['section_title']
        #if section_title == 'The Great Zebra Count':
        #    return match.string[slice(*match.span())]
        #    #return 'The Great Zebra Count'
        # general logic
        #words = section_title.split(' ')
        tokens = re.split(ut.regex_or([' ', '/']), section_title)
        #if 'Coverage' in section_title:
        #    ut.embed()
        #    pass
        #words = [word if count == 0 else word.lower() for count, word in enumerate(words)]
        #new_section_title = ' '.join(words)
        tokens = [
            t if count == 0 else t.lower() for count, t in enumerate(tokens)
        ]
        new_section_title = ''.join(tokens)

        # hacks for caps of expanded titles
        search_repl_list = constants_tex_fixes.CAPITAL_TITLE_LIST
        for repl in search_repl_list:
            new_section_title = re.sub(re.escape(repl),
                                       repl,
                                       new_section_title,
                                       flags=re.IGNORECASE)
        # hacks for acronyms
        for full, acro in constants_tex_fixes.ACRONYMN_LIST:
            new_section_title = re.sub(r'\b' + re.escape(acro) + r'\b',
                                       acro,
                                       new_section_title,
                                       flags=re.IGNORECASE)

        #'the great zebra and giraffe count'

        #new_section_title = section_title.lower()
        new_text = dict_['spaces'] + '\\' + dict_[
            'section_type'] + '{' + new_section_title + '}'
        VERBOSE = 0
        if VERBOSE:
            old_text = match.string[slice(*match.span())]
            if new_text != old_text:
                print(ut.dict_str(dict_))
                print('--- REPL ---')
                print(old_text)
                print(new_text)
        return new_text
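
The acronym pass near the end can be shown in isolation; a small hedged example with a toy list standing in for constants_tex_fixes.ACRONYMN_LIST:

import re

ACRONYMN_LIST = [('scale invariant feature transform', 'SIFT')]  # toy example

title = 'revisiting sift matching'
for full, acro in ACRONYMN_LIST:
    # match case-insensitively, but substitute the canonical capitalization
    title = re.sub(r'\b' + re.escape(acro) + r'\b', acro, title, flags=re.IGNORECASE)
print(title)  # revisiting SIFT matching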
Example #4
def tokenize_manacost(mana_cost):
    r"""

    CommandLine:
        python -m mtgmonte.mtgobjs --exec-tokenize_manacost

    Example:
        >>> # ENABLE_DOCTEST
        >>> from mtgmonte.mtgobjs import *  # NOQA
        >>> cards = load_cards(['Naya Hushblade', 'Gitaxian Probe', 'Spectral Procession', 'Emrakul, the Aeons Torn'])
        >>> manacost_list = [card.mana_cost for card in cards]
        >>> result = (ut.repr2([tokenize_manacost(mana_cost) for mana_cost in manacost_list], nl=2, nobraces=True))
        >>> print(result)
        [
            ('(R/W)', 'hybrid'),
            ('G', 'colored'),
        ],
        [
            ('(U/P)', 'phyrexian'),
        ],
        [
            ('(2/W)', 'hybrid'),
            ('(2/W)', 'hybrid'),
            ('(2/W)', 'hybrid'),
        ],
        [
            ('15', 'uncolored'),
        ],
    """
    if mana_cost == '{*}':
        return [('*', 'special')]
    colored_pat = ut.named_field('colored', '[' + MANA_SYMBOLS + 'C' + ']')
    uncolored_pat = ut.named_field('uncolored', '[0-9]+')
    life_pat = ut.named_field('life', 'P')
    phyrexian_pat = ut.named_field('phyrexian', r'\([' + MANA_SYMBOLS + r']/P\)')
    hybrid_pat = ut.named_field('hybrid', r'\([0-9' + MANA_SYMBOLS + r']/[' + MANA_SYMBOLS + r']\)')
    pattern = ut.regex_or([uncolored_pat, colored_pat, hybrid_pat, phyrexian_pat, life_pat])
    groupdicts = [x.groupdict() for x in re.finditer(pattern, mana_cost)]
    tokens = [(v, k) for d in groupdicts for k, v in d.items() if v is not None]
    # tokens = [x.groups() for x in re.finditer(pattern, card.mana_cost)]
    # assert all([len(t) == 1 for t in tokens])
    # tokens = [t[0] for t in tokens]
    return tokens
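
A self-contained sketch of the same named-group tokenization using plain re, assuming ut.named_field builds a (?P<name>...) group, ut.regex_or joins alternatives with '|', and MANA_SYMBOLS is roughly 'WUBRG':

import re

MANA_SYMBOLS = 'WUBRG'  # assumption; the real constant is defined elsewhere

def named_field(key, pattern):
    return '(?P<{}>{})'.format(key, pattern)

def regex_or(patterns):
    return '(?:' + '|'.join(patterns) + ')'

colored_pat = named_field('colored', '[' + MANA_SYMBOLS + 'C]')
uncolored_pat = named_field('uncolored', '[0-9]+')
pattern = regex_or([uncolored_pat, colored_pat])

groupdicts = [m.groupdict() for m in re.finditer(pattern, '2WW')]
tokens = [(v, k) for d in groupdicts for k, v in d.items() if v is not None]
print(tokens)  # [('2', 'uncolored'), ('W', 'colored'), ('W', 'colored')]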
Example #5
def fix_section_title_capitalization(tex_fpath, dryrun=True):
    # Read in text and ensure ascii format
    text = ut.read_from(tex_fpath)

    section_type_list = [
        'chapter',
        'section',
        'subsection',
        'subsubsection',
        'paragraph',
    ]
    re_section_type = ut.named_field('section_type',
                                     ut.regex_or(section_type_list))
    re_section_title = ut.named_field('section_title', '[^}]*')

    re_spaces = ut.named_field('spaces', '^ *')

    pattern = re_spaces + re.escape(
        '\\') + re_section_type + '{' + re_section_title + '}'

    def fix_capitalization(match):
        dict_ = match.groupdict()
        section_title = dict_['section_title']
        #if section_title == 'The Great Zebra Count':
        #    return match.string[slice(*match.span())]
        #    #return 'The Great Zebra Count'
        # general logic
        #words = section_title.split(' ')
        tokens = re.split(ut.regex_or([' ', '/']), section_title)
        #if 'Coverage' in section_title:
        #    ut.embed()
        #    pass
        #words = [word if count == 0 else word.lower() for count, word in enumerate(words)]
        #new_section_title = ' '.join(words)
        tokens = [
            t if count == 0 else t.lower() for count, t in enumerate(tokens)
        ]
        new_section_title = ''.join(tokens)

        # hacks for caps of expanded titles
        search_repl_list = constants_tex_fixes.CAPITAL_TITLE_LIST
        for repl in search_repl_list:
            new_section_title = re.sub(re.escape(repl),
                                       repl,
                                       new_section_title,
                                       flags=re.IGNORECASE)
        # hacks for acronyms
        for full, acro in constants_tex_fixes.ACRONYMN_LIST:
            new_section_title = re.sub(r'\b' + re.escape(acro) + r'\b',
                                       acro,
                                       new_section_title,
                                       flags=re.IGNORECASE)

        #'the great zebra and giraffe count'

        #new_section_title = section_title.lower()
        new_text = dict_['spaces'] + '\\' + dict_[
            'section_type'] + '{' + new_section_title + '}'
        VERBOSE = 0
        if VERBOSE:
            old_text = match.string[slice(*match.span())]
            if new_text != old_text:
                print(ut.dict_str(dict_))
                print('--- REPL ---')
                print(old_text)
                print(new_text)
        return new_text

    #for match in re.finditer(pattern, text, flags=re.MULTILINE):
    #    fix_capitalization(match)

    new_text = re.sub(pattern, fix_capitalization, text, flags=re.MULTILINE)

    if not dryrun:
        ut.write_to(tex_fpath, new_text)
    else:
        ut.print_difftext(ut.get_textdiff(text, new_text, 0))
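
The core technique here is re.sub with a callable replacement over named groups. A hedged, self-contained sketch of that idea on a tiny LaTeX string, using plain re instead of the ut helpers:

import re

pattern = (r'(?P<spaces>^ *)\\(?P<section_type>section|subsection)'
           r'\{(?P<section_title>[^}]*)\}')

def lower_after_first_word(match):
    d = match.groupdict()
    words = d['section_title'].split(' ')
    words = [w if i == 0 else w.lower() for i, w in enumerate(words)]
    return d['spaces'] + '\\' + d['section_type'] + '{' + ' '.join(words) + '}'

text = r'\section{Results And Discussion}'
print(re.sub(pattern, lower_after_first_word, text, flags=re.MULTILINE))
# \section{Results and discussion}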
Example #6
def fix_conference_title_names(clean_text, key_list=None):
    """
    mass bibtex fixes

    CommandLine:
        ./fix_bib.py
    """

    # Find citations from the tex documents
    if key_list is None:
        key_list = find_used_citations(testdata_fpaths())
        key_list = list(set(key_list))
        ignore = ['JP', '?']
        for item in ignore:
            try:
                key_list.remove(item)
            except ValueError:
                pass

    unknown_confkeys = []

    conference_keys = [
        'journal',
        'booktitle',
    ]

    ignore_confkey = []

    bib_database = bibtexparser.loads(clean_text)

    bibtex_dict = bib_database.get_entry_dict()

    isect = set(ignore_confkey).intersection(
        set(constants_tex_fixes.CONFERENCE_TITLE_MAPS.keys()))
    assert len(isect) == 0, repr(isect)

    #ut.embed()
    #conftitle_to_types_hist = ut.ddict(list)

    type_key = 'ENTRYTYPE'

    debug_author = ut.get_argval('--debug-author', type_=str, default=None)
    # ./fix_bib.py --debug_author=Kappes

    for key in bibtex_dict.keys():
        entry = bibtex_dict[key]

        if debug_author is not None:
            debug = debug_author in entry.get('author', '')
        else:
            debug = False

        if debug:
            print(' --- ENTRY ---')
            print(ut.repr3(entry))

        #if type_key not in entry:
        #    #entry[type_key] = entry['ENTRYTYPE']
        #    ut.embed()

        # Clip abstract
        if 'abstract' in entry:
            entry['abstract'] = ' '.join(entry['abstract'].split(' ')[0:7])

        # Remove Keys
        remove_keys = [
            'note',
            'urldate',
            'series',
            'publisher',
            'isbn',
            'editor',
            'shorttitle',
            'copyright',
            'language',
            'month',
            # These will be put back in
            #'number',
            #'pages',
            #'volume',
        ]
        entry = ut.delete_dict_keys(entry, remove_keys)

        # Fix conference names
        confkeys = list(set(entry.keys()).intersection(set(conference_keys)))
        #entry = ut.delete_dict_keys(entry, ['abstract'])
        # TODO: FIX THESE IF NEEDBE
        #if len(confkeys) == 0:
        #    print(ut.dict_str(entry))
        #    print(entry.keys())
        if len(confkeys) == 1:
            confkey = confkeys[0]
            old_confval = entry[confkey]
            # Remove curly braces
            old_confval = old_confval.replace('{', '').replace('}', '')
            if old_confval in ignore_confkey:
                print(ut.dict_str(entry))
                continue

            new_confval_candiates = []
            if old_confval.startswith('arXiv'):
                continue

            # for conf_title, patterns in constants_tex_fixes.CONFERENCE_TITLE_MAPS.items():
            for conf in constants_tex_fixes.CONFERENCES:
                if conf.matches(old_confval):
                    conf_title = conf.accro()
                    if debug:
                        print('old_confval = %r' % (old_confval, ))
                        print('conf_title = %r' % (conf_title, ))
                    new_confval = conf_title
                    new_confval_candiates.append(new_confval)

            if len(new_confval_candiates) == 0:
                new_confval = None
            elif len(new_confval_candiates) == 1:
                new_confval = new_confval_candiates[0]
            else:
                assert False, 'double match'

            if new_confval is None:
                if key in key_list:
                    unknown_confkeys.append(old_confval)
                #print(old_confval)
            else:
                # Overwrite old confval
                entry[confkey] = new_confval

            # Record info about types of conferences
            true_confval = entry[confkey].replace('{', '').replace('}', '')

            # FIX ENTRIES THAT SHOULD BE CONFERENCES
            if true_confval in constants_tex_fixes.CONFERENCE_LIST:
                if entry[type_key] == 'inproceedings':
                    pass
                    #print(confkey)
                    #print(ut.dict_str(entry))
                elif entry[type_key] == 'article':
                    entry['booktitle'] = entry['journal']
                    del entry['journal']
                    #print(ut.dict_str(entry))
                elif entry[type_key] == 'incollection':
                    pass
                else:
                    raise AssertionError('UNKNOWN TYPE: %r' %
                                         (entry[type_key], ))

                if 'booktitle' not in entry:
                    print('DOES NOT HAVE CORRECT CONFERENCE KEY')
                    print(ut.dict_str(entry))

                assert 'journal' not in entry, 'should not have journal'

                #print(entry['type'])
                entry[type_key] = 'inproceedings'

            # FIX ENTRIES THAT SHOULD BE JOURNALS
            if true_confval in constants_tex_fixes.JOURNAL_LIST:

                if entry[type_key] == 'article':
                    pass
                elif entry[type_key] == 'inproceedings':
                    pass
                    #print(ut.dict_str(entry))
                elif entry[type_key] == 'incollection':
                    pass
                else:
                    raise AssertionError('UNKNOWN TYPE: %r' %
                                         (entry['type'], ))

                if 'journal' not in entry:
                    print('DOES NOT HAVE CORRECT CONFERENCE KEY')
                    print(ut.dict_str(entry))

                assert 'booktitle' not in entry, 'should not have booktitle'
                #print(entry['type'])
                #entry['type'] = 'article'

            #conftitle_to_types_hist[true_confval].append(entry['type'])

        elif len(confkeys) > 1:
            raise AssertionError('more than one confkey=%r' % (confkeys, ))

        # Fix Authors
        if 'author' in entry:
            authors = six.text_type(entry['author'])
            for truename, alias_list in constants_tex_fixes.AUTHOR_NAME_MAPS.items():
                pattern = six.text_type(
                    ut.regex_or([
                        ut.util_regex.whole_word(alias) for alias in alias_list
                    ]))
                authors = re.sub(pattern,
                                 six.text_type(truename),
                                 authors,
                                 flags=re.UNICODE)
            entry['author'] = authors
    """
    article = journal
    inprocedings = converence paper

    """

    #conftitle_to_types_set_hist = {key: set(val) for key, val in conftitle_to_types_hist.items()}
    #print(ut.dict_str(conftitle_to_types_set_hist))

    print(ut.list_str(sorted(unknown_confkeys)))
    print('len(unknown_confkeys) = %r' % (len(unknown_confkeys), ))

    writer = BibTexWriter()
    writer.contents = ['comments', 'entries']
    writer.indent = '  '
    writer.order_entries_by = ('type', 'author', 'year')

    new_bibtex_str = bibtexparser.dumps(bib_database, writer)
    return new_bibtex_str
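
For context, a minimal sketch of the bibtexparser load, edit, and dump round-trip this function is built around (bibtexparser 1.x API; the sample entry and the edit rule are invented for illustration):

import bibtexparser
from bibtexparser.bwriter import BibTexWriter

clean_text = """
@article{doe2020,
  author  = {Doe, Jane},
  title   = {An Example},
  journal = {Proc. Example Conf.},
  year    = {2020},
}
"""
bib_database = bibtexparser.loads(clean_text)
for entry in bib_database.entries:
    # e.g. turn a conference paper mis-typed as an article into inproceedings
    if entry['ENTRYTYPE'] == 'article' and 'Conf' in entry.get('journal', ''):
        entry['booktitle'] = entry.pop('journal')
        entry['ENTRYTYPE'] = 'inproceedings'

writer = BibTexWriter()
writer.indent = '  '
print(bibtexparser.dumps(bib_database, writer))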