def test_simple(self):
    # Each entry pairs a raw name with the (words, separators) tuple that
    # split_name_words is expected to return for it.
    expectations = [
        ('Jean', (['Jean'], [])),
        ('Jean Pierre', (['Jean', 'Pierre'], [''])),
        ('Jean-Pierre', (['Jean', 'Pierre'], ['-'])),
        ('J.-P.', (['J', 'P'], ['-'])),
        ('J. P.', (['J', 'P'], [''])),
    ]
    for raw_name, expected in expectations:
        self.assertEqual(split_name_words(raw_name), expected)
def create_paper_plain_fingerprint(title, authors, year):
    """
    Builds the plain-text fingerprint of a bibliographic reference.

    The title is cleaned with ``kill_html`` and ``remove_diacritics``,
    lowercased, filtered through ``stripped_chars``, stripped, and its runs
    of spaces and hyphens are collapsed into single hyphens. A cleaned title
    longer than 50 characters is returned as is. Otherwise the year is
    appended when the title contains no hyphen (i.e. a single word), followed
    by one ``/``-prefixed, lowercased last name per author, in sorted order.

    :param title: the title of the paper
    :param authors: the list of author names, represented as
                    (first_name, last_name) pairs; falsy entries are skipped
    :param year: the year of publication of the paper (only used for
                 single-word titles)
    :returns: the plain fingerprint, as a string
    """
    title = kill_html(title)
    title = remove_diacritics(title).lower()
    title = stripped_chars.sub('', title)
    title = title.strip()
    title = re.sub('[ -]+', '-', title)

    # If the title is long enough, we return the fingerprint as is
    if len(title) > 50:
        return title

    buf = title
    # If the title is very short, we add the year
    # (for "Preface", "Introduction", "New members" cases)
    if '-' not in title:
        buf += '-' + str(year)

    author_names_list = []
    for author in authors:
        if not author:
            continue
        # Given names are not used in the fingerprint, so only the last
        # name is normalised.
        last_name = remove_diacritics(author[1])

        # Last name, without the small words such as "van", "der", "de"…
        words, separators = split_name_words(last_name)
        # A word is kept when it is capitalised or hyphen-attached to the
        # previous one. word[:1] (rather than word[0]) tolerates an empty
        # word instead of raising IndexError.
        last_words = [
            word for i, word in enumerate(words)
            if word[:1].isupper() or (i > 0 and separators[i - 1] == '-')
        ]

        # If no word was uppercased, fall back on all the words
        if not last_words:
            last_words = words

        author_names_list.append('-'.join(map(ulower, last_words)))

    author_names_list.sort()
    return buf + ''.join('/' + fp for fp in author_names_list)
def create_paper_plain_fingerprint(title, authors, year):
    """
    Returns the plain fingerprint string for a paper.

    :param title: the title of the paper
    :param authors: the list of author names, as (first_name, last_name)
                    pairs; falsy entries are ignored
    :param year: the year of publication, appended only to titles that
                 contain no hyphen once cleaned
    """
    cleaned = remove_diacritics(kill_html(title)).lower()
    cleaned = stripped_chars.sub('', cleaned).strip()
    cleaned = re.sub('[ -]+', '-', cleaned)

    # Titles longer than 50 characters are used as the whole fingerprint.
    if len(cleaned) > 50:
        return cleaned

    pieces = [cleaned]
    # One-word titles ("Preface", "Introduction", "New members" cases)
    # get the year appended.
    if '-' not in cleaned:
        pieces.append('-' + str(year))

    surname_keys = []
    for person in authors:
        if not person:
            continue
        normalized = (remove_diacritics(person[0]),
                      remove_diacritics(person[1]))

        # Only the last name contributes; small words such as
        # "van", "der", "de"… are left out.
        words, seps = split_name_words(normalized[1])
        kept = []
        for pos, word in enumerate(words):
            if word[0].isupper() or (pos > 0 and seps[pos - 1] == '-'):
                kept.append(word)

        # Nothing capitalised: use every word instead.
        if not kept:
            kept = words

        surname_keys.append('-'.join(map(ulower, kept)))

    surname_keys.sort()
    for key in surname_keys:
        pieces.append('/' + key)
    return ''.join(pieces)
def test_flattened(self):
    # Both spellings of the glued initials are expected to yield the
    # same two words joined by a hyphen separator.
    for flattened in ('JP.', 'Jp.'):
        self.assertEqual(split_name_words(flattened), (['J', 'P'], ['-']))
def test_unicode(self):
    # Accented characters must be kept inside the words they belong to.
    expectations = [
        ('Émilie', (['Émilie'], [])),
        ('José', (['José'], [])),
        ('José Alphonse', (['José', 'Alphonse'], [''])),
        ('É. R.', (['É', 'R'], [''])),
    ]
    for raw_name, expected in expectations:
        self.assertEqual(split_name_words(raw_name), expected)
def test_awkward_spacing(self):
    # Unusual spacing around initials and hyphens, paired with the
    # (words, separators) result expected for each input.
    expectations = [
        ('J.P.', (['J', 'P'], [''])),
        ('J. P.', (['J', 'P'], [''])),
        ('Jean - Pierre', (['Jean', 'Pierre'], ['-'])),
    ]
    for raw_name, expected in expectations:
        self.assertEqual(split_name_words(raw_name), expected)
def test_strange_characters(self):
    # TODO ?
    observed = split_name_words('Jean*-Frederic')
    expected = (['Jean', 'Frederic'], ['-'])
    self.assertEqual(observed, expected)
def test_probably_not_flattened(self):
    # A full word followed by a dot is expected to stay a single word,
    # with the trailing dot dropped.
    observed = split_name_words('Joseph.')
    expected = (['Joseph'], [])
    self.assertEqual(observed, expected)
def test_abbreviation(self):
    # These abbreviations are expected to keep their trailing dot.
    expectations = [
        ('Ms.', (['Ms.'], [])),
        ('St. Louis', (['St.', 'Louis'], [''])),
    ]
    for raw_name, expected in expectations:
        self.assertEqual(split_name_words(raw_name), expected)
def create_paper_plain_fingerprint(title, authors, year):
    """
    Creates a robust summary of a bibliographic reference.
    This plain fingerprint should then be converted to an
    actual fingerprint by hashing it (so that the length remains
    constant).

    :param title: the title of the paper
    :param authors: the list of author names, represented
                    as (first_name, last_name) pairs; falsy entries
                    are skipped
    :param year: the year of publication of the paper (only appended
                 when the cleaned title is a single word)
    :returns: the plain fingerprint, as a string

    >>> create_paper_plain_fingerprint(' It cleans whitespace And Case\\n',[('John','Doe')], 2015)
    u'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    u'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    u'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    u'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    u'ambiguity-2014/doe'
    """
    title = kill_html(title)
    title = remove_diacritics(title).lower()
    title = stripped_chars.sub('', title)
    title = title.strip()
    title = re.sub('[ -]+', '-', title)

    # If the title is long enough, we return the fingerprint as is
    if len(title) > 50:
        return title

    fingerprint = title
    # If the title is very short, we add the year
    # (for "Preface", "Introduction", "New members" cases)
    if '-' not in title:
        fingerprint += '-' + str(year)

    author_names_list = []
    for author in authors:
        if not author:
            continue
        # The first name does not take part in the fingerprint, so only
        # the last name needs its diacritics removed.
        last_name = remove_diacritics(author[1])

        # Last name, without the small words such as "van", "der", "de"…
        last_name_words, last_name_separators = split_name_words(last_name)
        # Keep capitalised words and words hyphen-attached to the previous
        # one. Slicing with [:1] avoids an IndexError on an empty word.
        last_words = [
            word for i, word in enumerate(last_name_words)
            if (word[:1].isupper() or
                (i > 0 and last_name_separators[i - 1] == '-'))
        ]

        # If no word was uppercased, fall back on all the words
        if not last_words:
            last_words = last_name_words

        # Lowercase and join the remaining words
        author_names_list.append('-'.join(map(ulower, last_words)))

    suffixes = ['/' + name for name in sorted(author_names_list)]
    return fingerprint + ''.join(suffixes)
def create_paper_plain_fingerprint(title, authors, year):
    """
    Summarises a bibliographic reference as a plain string.
    The result is meant to be hashed afterwards to obtain the
    actual, constant-length fingerprint.

    :param title: the title of the paper
    :param authors: the list of author names, represented
                    as (first_name, last_name) pairs
    :param year: the year of publication of the paper

    >>> create_paper_plain_fingerprint(' It cleans whitespace And Case\\n',[('John','Doe')], 2015)
    u'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    u'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    u'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    u'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    u'ambiguity-2014/doe'
    """
    slug = remove_diacritics(kill_html(title)).lower()
    slug = stripped_chars.sub('', slug).strip()
    slug = re.sub('[ -]+', '-', slug)

    # A long enough title is the whole fingerprint.
    if len(slug) > 50:
        return slug

    chunks = [slug]
    # A very short title also gets the year
    # (for "Preface", "Introduction", "New members" cases).
    if '-' not in slug:
        chunks.append('-' + str(year))

    family_keys = []
    for entry in authors:
        if not entry:
            continue
        entry = (remove_diacritics(entry[0]), remove_diacritics(entry[1]))

        # Last name, without the small words such as "van", "der", "de"…
        name_words, name_seps = split_name_words(entry[1])
        significant = [
            token for idx, token in enumerate(name_words)
            if token[0].isupper() or (idx > 0 and name_seps[idx - 1] == '-')
        ]

        # With no capitalised word at all, every word is used.
        if not significant:
            significant = name_words

        # Lowercase the words and glue them with hyphens.
        family_keys.append('-'.join(map(ulower, significant)))

    family_keys.sort()
    chunks.extend('/' + key for key in family_keys)
    return ''.join(chunks)