def lowerupper(s):
    """Partition the pieces of *s* at the first piece matching ``recapstart``.

    *s* is split with the module-level ``relu`` pattern and empty pieces are
    discarded.  Each remaining piece is passed through ``undiacritic`` and
    tested against ``recapstart``; the first piece that matches starts the
    second list.

    Returns a ``(lower, upper)`` tuple of lists: ``lower`` holds the pieces
    before the first match, ``upper`` holds that piece and everything after
    it.  When nothing matches, ``upper`` is empty.
    """
    tokens = [tok for tok in relu.split(s) if tok]
    # Index of the first matching piece, or len(tokens) when none matches.
    split_at = next(
        (idx for idx, tok in enumerate(tokens)
         if recapstart.match(undiacritic(tok))),
        len(tokens))
    return (tokens[:split_at], tokens[split_at:])
def keyid(fields, fd={}, ti=2, infinity=float('inf')):
    """Derive a key string from a mapping of bibliographic fields.

    The key has the shape ``<names>_<title words><volume><year>``; it is
    lower-cased and then passed through ``reokkey.sub("", ...)``:

    * names: ``lastnamekey`` of every contributor's ``'lastname'``, sorted,
      run through ``undiacritic`` and joined with ``-``.  Contributors come
      from ``'author'`` if present, otherwise from ``'editor'``.
    * title words: up to *ti* words of the title that match ``rewrdtok`` and
      have the smallest value in *fd*, kept in title order, joined with ``-``.
    * volume: ``roman(fields['volume'])``, but only when there is a volume
      and none of ``'journal'``, ``'booktitle'`` or ``'series'``.
    * year: the first four characters of ``pyear(year)`` (``'[nd]'`` when no
      year is given), followed by ``fields['extra_hash']`` if present.

    If the record has neither ``'author'`` nor ``'editor'``, the result is
    ``'__missingcontrib__'`` followed by the lower-cased concatenation of all
    field values except ``'glottolog_ref_id'``, with ``reokkey`` matches
    replaced by ``'_'``.

    Args:
        fields: dict-like record; must support ``in``, ``[]`` and ``.get``.
        fd: mapping from title word to a sortable weight (presumably a
            frequency count -- confirm with callers).  Only read, never
            mutated, so the shared ``{}`` default is safe.
        ti: maximum number of title words to include.
        infinity: weight used for words missing from *fd*.

    Returns:
        The key string.
    """
    # NOTE(review): another definition of ``keyid`` follows this one in the
    # file and shadows it at import time; consider removing one of them.
    if 'author' in fields:
        astring = fields['author']
    elif 'editor' in fields:
        astring = fields['editor']
    else:
        values = ''.join(
            v for f, v in bibord_iteritems(fields) if f != 'glottolog_ref_id')
        return '__missingcontrib__' + reokkey.sub('_', values.lower())

    authors = pauthor(astring)
    names = astring.split(' and ')
    if len(authors) != len(names):
        # Diagnostics only: the parser produced a different number of
        # contributors than a plain split on ' and ' does.  Single-argument
        # print(...) gives the same output under Python 2 and Python 3.
        print("Unparsed author in %s" % (authors,))
        print("  %s %s" % (astring, names))
        # .get() so that a record without a title cannot raise KeyError in
        # what is meant to be a warning; the key derivation below already
        # tolerates a missing title with the same placeholder.
        print(fields.get('title', 'no.title'))

    ak = [undiacritic(x)
          for x in sorted(lastnamekey(a['lastname']) for a in authors)]
    yk = pyear(fields.get('year', '[nd]'))[:4]
    tks = wrds(fields.get("title", "no.title"))

    # select the (leftmost) ``ti`` least frequent words from the title
    types = uniqued(w for w in tks if rewrdtok.match(w))
    tk = nsmallest(ti, types, key=lambda w: fd.get(w, infinity))
    # put them back into the title order (i.e. 'spam eggs' != 'eggs spam')
    order = {w: i for i, w in enumerate(types)}
    tk.sort(key=lambda w: order[w])

    # A volume only contributes when the record has no containing
    # journal, book or series.
    if ('volume' in fields and 'journal' not in fields
            and 'booktitle' not in fields and 'series' not in fields):
        vk = roman(fields['volume'])
    else:
        vk = ''

    if 'extra_hash' in fields:
        yk = yk + fields['extra_hash']

    key = '-'.join(ak) + "_" + '-'.join(tk) + vk + yk
    return reokkey.sub("", key.lower())
def keyid(fields, fd={}, ti=2, infinity=float('inf')):
    """Derive a key string from a mapping of bibliographic fields.

    The key has the shape ``<names>_<title words><volume><year>``; it is
    lower-cased and then passed through ``reokkey.sub("", ...)``:

    * names: ``lastnamekey`` of every contributor's ``'lastname'``, sorted,
      run through ``undiacritic`` and joined with ``-``.  Contributors come
      from ``'author'`` if present, otherwise from ``'editor'``.
    * title words: up to *ti* words of the title that match ``rewrdtok`` and
      have the smallest value in *fd*, kept in title order, joined with ``-``.
    * volume: ``roman(fields['volume'])``, but only when there is a volume
      and none of ``'journal'``, ``'booktitle'`` or ``'series'``.
    * year: the first four characters of ``pyear(year)`` (``'[nd]'`` when no
      year is given), followed by ``fields['extra_hash']`` if present.

    If the record has neither ``'author'`` nor ``'editor'``, the result is
    ``'__missingcontrib__'`` followed by the lower-cased concatenation of all
    field values except ``'glottolog_ref_id'``, with ``reokkey`` matches
    replaced by ``'_'``.

    Args:
        fields: dict-like record; must support ``in``, ``[]`` and ``.get``.
        fd: mapping from title word to a sortable weight (presumably a
            frequency count -- confirm with callers).  Only read, never
            mutated, so the shared ``{}`` default is safe.
        ti: maximum number of title words to include.
        infinity: weight used for words missing from *fd*.

    Returns:
        The key string.
    """
    # NOTE(review): an earlier definition of ``keyid`` precedes this one in
    # the file; this later one is the definition that takes effect.
    if 'author' in fields:
        astring = fields['author']
    elif 'editor' in fields:
        astring = fields['editor']
    else:
        values = ''.join(
            v for f, v in bibord_iteritems(fields) if f != 'glottolog_ref_id')
        return '__missingcontrib__' + reokkey.sub('_', values.lower())

    authors = pauthor(astring)
    names = astring.split(' and ')
    if len(authors) != len(names):
        # Diagnostics only: the parser produced a different number of
        # contributors than a plain split on ' and ' does.  Single-argument
        # print(...) gives the same output under Python 2 and Python 3.
        print("Unparsed author in %s" % (authors,))
        print("  %s %s" % (astring, names))
        # .get() so that a record without a title cannot raise KeyError in
        # what is meant to be a warning; the key derivation below already
        # tolerates a missing title with the same placeholder.
        print(fields.get('title', 'no.title'))

    ak = [undiacritic(x)
          for x in sorted(lastnamekey(a['lastname']) for a in authors)]
    yk = pyear(fields.get('year', '[nd]'))[:4]
    tks = wrds(fields.get("title", "no.title"))

    # select the (leftmost) ``ti`` least frequent words from the title
    types = uniqued(w for w in tks if rewrdtok.match(w))
    tk = nsmallest(ti, types, key=lambda w: fd.get(w, infinity))
    # put them back into the title order (i.e. 'spam eggs' != 'eggs spam')
    order = {w: i for i, w in enumerate(types)}
    tk.sort(key=lambda w: order[w])

    # A volume only contributes when the record has no containing
    # journal, book or series.
    if ('volume' in fields and 'journal' not in fields
            and 'booktitle' not in fields and 'series' not in fields):
        vk = roman(fields['volume'])
    else:
        vk = ''

    if 'extra_hash' in fields:
        yk = yk + fields['extra_hash']

    key = '-'.join(ak) + "_" + '-'.join(tk) + vk + yk
    return reokkey.sub("", key.lower())
def wrds(txt):
    """Split *txt* into a list of non-empty pieces.

    The text is lower-cased, passed through ``undiacritic``, stripped of
    single and double quote characters, and then split with the module-level
    ``resplittit`` pattern.  Empty pieces produced by the split are dropped.
    """
    normalized = undiacritic(txt.lower())
    for quote in ("'", '"'):
        normalized = normalized.replace(quote, "")
    # filter(None, ...) discards the falsy pieces the split can produce.
    return list(filter(None, resplittit.split(normalized)))