def parse_all_fname_tags(fname):
    """Extract normalized tag tokens from a filename.

    Drops the file extension, splits the stem on underscores and dots,
    lowercases each piece, applies the direct alias map and the regex
    alias map, filters out invalid tags, and returns the unique tags in
    order of first appearance.
    """
    tags = [splitext(fname)[0]]
    # Split on each separator in turn, flattening after every pass.
    for sep in ('_', '.'):
        tags = ut.flatten([t.split(sep) for t in tags])
    tags = [t.lower() for t in tags]
    # Direct one-to-one renames first.
    tags = [tag_alias_map.get(t, t) for t in tags]
    # Then pattern-based renames: any tag matching a variant pattern is
    # replaced by its canonical key.
    for canonical, variants in regex_alias_map.items():
        variant_pat = ut.regex_or(variants)
        tags = [canonical if re.match(variant_pat, t) else t for t in tags]
    # Discard tags matching any known-invalid pattern.
    invalid_pat = ut.regex_or(invalid_tag_patterns)
    tags = [t for t in tags if not re.match(invalid_pat, t)]
    return ut.unique_ordered(tags)
def fix_authors(cleaner):
    """Canonicalize author names in ``cleaner.entry['author']``.

    For every (true name, aliases) pair in AUTHOR_NAME_MAPS, replaces
    whole-word occurrences of each alias with the true name. No-op when
    the entry has no 'author' field.
    """
    if 'author' not in cleaner.entry:
        return
    author_text = six.text_type(cleaner.entry['author'])
    name_maps = constants_tex_fixes.AUTHOR_NAME_MAPS
    for canonical_name, aliases in name_maps.items():
        # Whole-word alternation so substrings of longer names don't match.
        alias_words = [ut.util_regex.whole_word(alias) for alias in aliases]
        alias_pat = six.text_type(ut.regex_or(alias_words))
        author_text = re.sub(alias_pat, six.text_type(canonical_name),
                             author_text, flags=re.UNICODE)
    cleaner.entry['author'] = author_text
def fix_capitalization(match):
    """Normalize the capitalization of one matched LaTeX sectioning command.

    Args:
        match: a regex match object with named groups 'spaces',
            'section_type', and 'section_title'.

    Returns:
        str: the rebuilt ``\\<section_type>{<title>}`` text; the title is
        lowercased after its first token, then known multi-word titles
        (CAPITAL_TITLE_LIST) and acronyms (ACRONYMN_LIST) are restored
        case-insensitively.
    """
    dict_ = match.groupdict()
    section_title = dict_['section_title']
    #if section_title == 'The Great Zebra Count':
    #    return match.string[slice(*match.span())]
    #    #return 'The Great Zebra Count'
    # general logic
    #words = section_title.split(' ')
    # NOTE(review): assumes ut.regex_or wraps alternatives in a capturing
    # group, so re.split keeps the ' ' and '/' separators as tokens and the
    # ''.join below reconstructs the title verbatim -- TODO confirm.
    tokens = re.split(ut.regex_or([' ', '/']), section_title)
    #if 'Coverage' in section_title:
    #    ut.embed()
    #    pass
    #words = [word if count == 0 else word.lower() for count, word in enumerate(words)]
    #new_section_title = ' '.join(words)
    # Lowercase every token except the first.
    tokens = [
        t if count == 0 else t.lower() for count, t in enumerate(tokens)
    ]
    new_section_title = ''.join(tokens)
    # hacks for caps of expanded titles
    search_repl_list = constants_tex_fixes.CAPITAL_TITLE_LIST
    for repl in search_repl_list:
        new_section_title = re.sub(re.escape(repl), repl, new_section_title,
                                   flags=re.IGNORECASE)
    # hacks for acronyms
    for full, acro in constants_tex_fixes.ACRONYMN_LIST:
        new_section_title = re.sub(r'\b' + re.escape(acro) + r'\b', acro,
                                   new_section_title, flags=re.IGNORECASE)
    #'the great zebra and giraffe count'
    #new_section_title = section_title.lower()
    new_text = dict_['spaces'] + '\\' + dict_[
        'section_type'] + '{' + new_section_title + '}'
    # Debug switch: print a before/after diff of each replacement.
    VERBOSE = 0
    if VERBOSE:
        old_text = match.string[slice(*match.span())]
        if new_text != old_text:
            print(ut.dict_str(dict_))
            print('--- REPL ---')
            print(old_text)
            print(new_text)
    return new_text
def tokenize_manacost(mana_cost):
    r"""Tokenize an MTG mana-cost string into ``(symbol, kind)`` pairs.

    Each kind is one of 'colored', 'uncolored', 'hybrid', 'phyrexian',
    'life', or 'special' (the variable cost '{*}').

    CommandLine:
        python -m mtgmonte.mtgobjs --exec-tokenize_manacost

    Example:
        >>> # ENABLE_DOCTEST
        >>> from mtgmonte.mtgobjs import *  # NOQA
        >>> cards = load_cards(['Naya Hushblade', 'Gitaxian Probe', 'Spectral Procession', 'Emrakul, the Aeons Torn'])
        >>> manacost_list = [card.mana_cost for card in cards]
        >>> result = (ut.repr2([tokenize_manacost(mana_cost) for mana_cost in manacost_list], nl=2, nobraces=True))
        >>> print(result)
        [
            ('(R/W)', 'hybrid'),
            ('G', 'colored'),
        ],
        [
            ('(U/P)', 'phyrexian'),
        ],
        [
            ('(2/W)', 'hybrid'),
            ('(2/W)', 'hybrid'),
            ('(2/W)', 'hybrid'),
        ],
        [
            ('15', 'uncolored'),
        ],
    """
    # Special-case the variable cost string.
    if mana_cost == '{*}':
        return [('*', 'special')]
    colored_pat = ut.named_field('colored', '[' + MANA_SYMBOLS + 'C' + ']')
    uncolored_pat = ut.named_field('uncolored', '[0-9]+', )
    life_pat = ut.named_field('life', 'P', )
    # FIX: raw strings so \( and \) are literal parens rather than invalid
    # escape sequences (a SyntaxWarning on modern CPython).
    phyrexian_pat = ut.named_field('phyrexian', r'\([' + MANA_SYMBOLS + r']/P\)')
    hybrid_pat = ut.named_field('hybrid',
                                r'\([0-9' + MANA_SYMBOLS + r']/[' + MANA_SYMBOLS + r']\)')
    patern = ut.regex_or([uncolored_pat, colored_pat, hybrid_pat,
                          phyrexian_pat, life_pat])
    groupdicts = [x.groupdict() for x in re.finditer(patern, mana_cost)]
    # One named group fires per match; emit (matched text, group name).
    tokens = [(v, k) for d in groupdicts for k, v in d.items() if v is not None]
    # tokens = [x.groups() for x in re.finditer(patern, card.mana_cost)]
    # assert all([len(t) == 1 for t in tokens])
    # tokens = [t[0] for t in tokens]
    return tokens
def fix_section_title_capitalization(tex_fpath, dryrun=True):
    """Rewrite LaTeX sectioning titles in a file to sentence case.

    Matches ``\\chapter{...}`` through ``\\paragraph{...}`` commands,
    lowercases each title after its first token, then restores known
    capitalized titles and acronyms.

    Args:
        tex_fpath (str): path of the .tex file to process.
        dryrun (bool): when True (default) only print a diff; when False
            write the modified text back to ``tex_fpath``.
    """
    # Read in text and ensure ascii format
    text = ut.read_from(tex_fpath)

    section_type_list = [
        'chapter',
        'section',
        'subsection',
        'subsubsection',
        'paragraph',
    ]
    re_section_type = ut.named_field('section_type',
                                     ut.regex_or(section_type_list))
    # Title body: everything up to the closing brace (no nested braces).
    re_section_title = ut.named_field('section_title', '[^}]*')
    re_spaces = ut.named_field('spaces', '^ *')
    pattern = re_spaces + re.escape(
        '\\') + re_section_type + '{' + re_section_title + '}'

    def fix_capitalization(match):
        """Rebuild one matched sectioning command with fixed capitalization."""
        dict_ = match.groupdict()
        section_title = dict_['section_title']
        #if section_title == 'The Great Zebra Count':
        #    return match.string[slice(*match.span())]
        #    #return 'The Great Zebra Count'
        # general logic
        #words = section_title.split(' ')
        # NOTE(review): assumes ut.regex_or produces a capturing group so
        # re.split keeps the ' ' and '/' separators as tokens and the
        # ''.join below restores them -- TODO confirm.
        tokens = re.split(ut.regex_or([' ', '/']), section_title)
        #if 'Coverage' in section_title:
        #    ut.embed()
        #    pass
        #words = [word if count == 0 else word.lower() for count, word in enumerate(words)]
        #new_section_title = ' '.join(words)
        # Lowercase every token except the first.
        tokens = [
            t if count == 0 else t.lower() for count, t in enumerate(tokens)
        ]
        new_section_title = ''.join(tokens)
        # hacks for caps of expanded titles
        search_repl_list = constants_tex_fixes.CAPITAL_TITLE_LIST
        for repl in search_repl_list:
            new_section_title = re.sub(re.escape(repl), repl,
                                       new_section_title,
                                       flags=re.IGNORECASE)
        # hacks for acronyms
        for full, acro in constants_tex_fixes.ACRONYMN_LIST:
            new_section_title = re.sub(r'\b' + re.escape(acro) + r'\b', acro,
                                       new_section_title,
                                       flags=re.IGNORECASE)
        #'the great zebra and giraffe count'
        #new_section_title = section_title.lower()
        new_text = dict_['spaces'] + '\\' + dict_[
            'section_type'] + '{' + new_section_title + '}'
        # Debug switch: print a before/after diff of each replacement.
        VERBOSE = 0
        if VERBOSE:
            old_text = match.string[slice(*match.span())]
            if new_text != old_text:
                print(ut.dict_str(dict_))
                print('--- REPL ---')
                print(old_text)
                print(new_text)
        return new_text

    #for match in re.finditer(pattern, text, flags=re.MULTILINE):
    #    fix_capitalization(match)

    new_text = re.sub(pattern, fix_capitalization, text, flags=re.MULTILINE)
    if not dryrun:
        ut.write_to(tex_fpath, new_text)
    else:
        ut.print_difftext(ut.get_textdiff(text, new_text, 0))
def fix_conference_title_names(clean_text, key_list=None):
    """
    mass bibtex fixes

    Normalizes conference/journal titles, entry types, and author names
    in a bibtex string.

    Args:
        clean_text (str): bibtex source to fix.
        key_list (list): citation keys in active use; when None they are
            discovered from the tex documents.

    Returns:
        str: the re-serialized bibtex string.

    CommandLine:
        ./fix_bib.py
    """
    # Find citations from the tex documents
    if key_list is None:
        key_list = find_used_citations(testdata_fpaths())
        key_list = list(set(key_list))
        ignore = ['JP', '?']
        for item in ignore:
            try:
                key_list.remove(item)
            except ValueError:
                pass

    unknown_confkeys = []
    # Fields that can hold a venue title.
    conference_keys = [
        'journal',
        'booktitle',
    ]
    ignore_confkey = []
    bib_database = bibtexparser.loads(clean_text)
    bibtex_dict = bib_database.get_entry_dict()
    # Sanity check: an ignored venue must not also be a mapped venue.
    isect = set(ignore_confkey).intersection(
        set(constants_tex_fixes.CONFERENCE_TITLE_MAPS.keys()))
    assert len(isect) == 0, repr(isect)
    #ut.embed()
    #conftitle_to_types_hist = ut.ddict(list)
    # bibtexparser stores the entry type under this key.
    type_key = 'ENTRYTYPE'

    debug_author = ut.get_argval('--debug-author', type_=str, default=None)
    # ./fix_bib.py --debug_author=Kappes

    for key in bibtex_dict.keys():
        entry = bibtex_dict[key]
        if debug_author is not None:
            debug = debug_author in entry.get('author', '')
        else:
            debug = False
        if debug:
            print(' --- ENTRY ---')
            print(ut.repr3(entry))
        #if type_key not in entry:
        #    #entry[type_key] = entry['ENTRYTYPE']
        #    ut.embed()

        # Clip abstrat
        if 'abstract' in entry:
            entry['abstract'] = ' '.join(entry['abstract'].split(' ')[0:7])

        # Remove Keys
        remove_keys = [
            'note',
            'urldate',
            'series',
            'publisher',
            'isbn',
            'editor',
            'shorttitle',
            'copyright',
            'language',
            'month',
            # These will be put back in
            #'number',
            #'pages',
            #'volume',
        ]
        entry = ut.delete_dict_keys(entry, remove_keys)

        # Fix conference names
        confkeys = list(set(entry.keys()).intersection(set(conference_keys)))
        #entry = ut.delete_dict_keys(entry, ['abstract'])
        # TODO: FIX THESE IF NEEDBE
        #if len(confkeys) == 0:
        #    print(ut.dict_str(entry))
        #    print(entry.keys())
        if len(confkeys) == 1:
            confkey = confkeys[0]
            old_confval = entry[confkey]
            # Remove curly braces
            old_confval = old_confval.replace('{', '').replace('}', '')
            if old_confval in ignore_confkey:
                print(ut.dict_str(entry))
                continue
            new_confval_candiates = []
            if old_confval.startswith('arXiv'):
                continue
            # for conf_title, patterns in constants_tex_fixes.CONFERENCE_TITLE_MAPS.items():
            for conf in constants_tex_fixes.CONFERENCES:
                if conf.matches(old_confval):
                    conf_title = conf.accro()
                    if debug:
                        print('old_confval = %r' % (old_confval, ))
                        print('conf_title = %r' % (conf_title, ))
                    new_confval = conf_title
                    new_confval_candiates.append(new_confval)

            if len(new_confval_candiates) == 0:
                new_confval = None
            elif len(new_confval_candiates) == 1:
                new_confval = new_confval_candiates[0]
            else:
                assert False, 'double match'

            if new_confval is None:
                if key in key_list:
                    unknown_confkeys.append(old_confval)
                #print(old_confval)
            else:
                # Overwrite old confval
                entry[confkey] = new_confval

            # Record info about types of conferneces
            true_confval = entry[confkey].replace('{', '').replace('}', '')

            # FIX ENTRIES THAT SHOULD BE CONFERENCES
            if true_confval in constants_tex_fixes.CONFERENCE_LIST:
                if entry[type_key] == 'inproceedings':
                    pass
                    #print(confkey)
                    #print(ut.dict_str(entry))
                elif entry[type_key] == 'article':
                    entry['booktitle'] = entry['journal']
                    del entry['journal']
                    #print(ut.dict_str(entry))
                elif entry[type_key] == 'incollection':
                    pass
                else:
                    raise AssertionError('UNKNOWN TYPE: %r' %
                                         (entry[type_key], ))

                if 'booktitle' not in entry:
                    print('DOES NOT HAVE CORRECT CONFERENCE KEY')
                    print(ut.dict_str(entry))
                assert 'journal' not in entry, 'should not have journal'
                #print(entry['type'])
                entry[type_key] = 'inproceedings'

            # FIX ENTRIES THAT SHOULD BE JOURNALS
            if true_confval in constants_tex_fixes.JOURNAL_LIST:
                if entry[type_key] == 'article':
                    pass
                elif entry[type_key] == 'inproceedings':
                    pass
                    #print(ut.dict_str(entry))
                elif entry[type_key] == 'incollection':
                    pass
                else:
                    # FIX: was entry['type'], which does not exist (the
                    # entry type lives under type_key == 'ENTRYTYPE' as
                    # used everywhere else) and would raise KeyError
                    # instead of this AssertionError.
                    raise AssertionError('UNKNOWN TYPE: %r' %
                                         (entry[type_key], ))

                if 'journal' not in entry:
                    print('DOES NOT HAVE CORRECT CONFERENCE KEY')
                    print(ut.dict_str(entry))
                assert 'booktitle' not in entry, 'should not have booktitle'
                #print(entry['type'])
                #entry['type'] = 'article'
            #conftitle_to_types_hist[true_confval].append(entry['type'])
        elif len(confkeys) > 1:
            raise AssertionError('more than one confkey=%r' % (confkeys, ))

        # Fix Authors
        if 'author' in entry:
            authors = six.text_type(entry['author'])
            for truename, alias_list in constants_tex_fixes.AUTHOR_NAME_MAPS.items(
            ):
                pattern = six.text_type(
                    ut.regex_or([
                        ut.util_regex.whole_word(alias)
                        for alias in alias_list
                    ]))
                authors = re.sub(pattern, six.text_type(truename), authors,
                                 flags=re.UNICODE)
            entry['author'] = authors
    """
    article = journal
    inprocedings = converence paper
    """
    #conftitle_to_types_set_hist = {key: set(val) for key, val in conftitle_to_types_hist.items()}
    #print(ut.dict_str(conftitle_to_types_set_hist))

    print(ut.list_str(sorted(unknown_confkeys)))
    print('len(unknown_confkeys) = %r' % (len(unknown_confkeys), ))

    writer = BibTexWriter()
    writer.contents = ['comments', 'entries']
    writer.indent = ' '
    writer.order_entries_by = ('type', 'author', 'year')
    new_bibtex_str = bibtexparser.dumps(bib_database, writer)
    return new_bibtex_str