Example #1
0
 def test_simple(self):
     # Each case pairs an input name with the (words, separators) tuple
     # expected from split_name_words: a space between two words is
     # reported as '' and a hyphen as '-'.
     cases = [
         ('Jean', (['Jean'], [])),
         ('Jean Pierre', (['Jean', 'Pierre'], [''])),
         ('Jean-Pierre', (['Jean', 'Pierre'], ['-'])),
         ('J.-P.', (['J', 'P'], ['-'])),
         ('J. P.', (['J', 'P'], [''])),
     ]
     for name, expected in cases:
         self.assertEqual(split_name_words(name), expected)
Example #2
0
 def test_simple(self):
     # Expected (words, separators) results, keyed by the order in which
     # the inputs are checked.  Dots after initials are expected to be
     # dropped from the returned words.
     inputs = ('Jean', 'Jean Pierre', 'Jean-Pierre', 'J.-P.', 'J. P.')
     outputs = (
         (['Jean'], []),
         (['Jean', 'Pierre'], ['']),
         (['Jean', 'Pierre'], ['-']),
         (['J', 'P'], ['-']),
         (['J', 'P'], ['']),
     )
     for raw_name, expected in zip(inputs, outputs):
         self.assertEqual(split_name_words(raw_name), expected)
Example #3
0
def create_paper_plain_fingerprint(title, authors, year):
    """
    Build the plain-text fingerprint of a bibliographic reference.

    The title is cleaned with ``kill_html`` and ``remove_diacritics``,
    lowercased, filtered through ``stripped_chars``, stripped, and every
    run of spaces and hyphens is collapsed to a single ``-``.

    :param title: the title of the paper
    :param authors: iterable of authors, each indexable so that
        ``author[1]`` is the last name; falsy entries are skipped.
        Only the last name contributes to the fingerprint.
    :param year: the year of publication; it is only used when the
        normalized title contains no ``-``
    :returns: the normalized title alone when it is longer than 50
        characters; otherwise the normalized title, an optional
        ``-<year>`` suffix, and one ``/<last name>`` segment per author,
        in sorted order
    """
    title = kill_html(title)
    title = remove_diacritics(title).lower()
    title = stripped_chars.sub('', title)
    title = title.strip()
    title = re.sub('[ -]+', '-', title)

    # If the title is long enough, we return the fingerprint as is
    if len(title) > 50:
        return title

    buf = title
    # A title without any '-' is a single word ("Preface",
    # "Introduction", ...): add the year to make it less ambiguous.
    if '-' not in title:
        buf += '-' + str(year)

    author_names_list = []
    for author in authors:
        if not author:
            continue
        # Given names are not used anymore in the fingerprints, so only
        # the last name is normalized.
        last_name = remove_diacritics(author[1])
        words, separators = split_name_words(last_name)

        # Last name, without the small words such as "van", "der", "de":
        # keep the capitalized words and the words that follow a hyphen.
        # word[:1] (rather than word[0]) treats an empty word as not
        # capitalized instead of raising IndexError.
        kept = [
            word for i, word in enumerate(words)
            if word[:1].isupper() or (i > 0 and separators[i - 1] == '-')
        ]

        # If no word was kept, fall back on all the words
        if not kept:
            kept = words

        author_names_list.append('-'.join(ulower(word) for word in kept))

    # Author segments are sorted so that the order of `authors` does not
    # influence the result.
    return '/'.join([buf] + sorted(author_names_list))
Example #4
0
def create_paper_plain_fingerprint(title, authors, year):
    """
    Compute the plain (not yet hashed) fingerprint of a paper.

    The fingerprint starts with a normalized form of the title: it goes
    through ``kill_html`` and ``remove_diacritics``, is lowercased, has
    the characters matched by ``stripped_chars`` removed, is stripped,
    and has each run of spaces/hyphens replaced by one ``-``.

    :param title: the title of the paper
    :param authors: iterable of authors; ``author[1]`` is read as the
        last name and falsy entries are ignored
    :param year: the year of publication, appended to the title part
        only when the normalized title contains no ``-``
    :returns: the normalized title if it is longer than 50 characters,
        else ``<title>[-<year>]`` followed by ``/<last name>`` for each
        author, last names being sorted
    """
    title = kill_html(title)
    title = remove_diacritics(title).lower()
    title = stripped_chars.sub('', title)
    title = title.strip()
    title = re.sub('[ -]+', '-', title)

    # If the title is long enough, we return the fingerprint as is
    if len(title) > 50:
        return title

    segments = [title]
    # One-word titles ("Preface", "Introduction", ...) are too common to
    # identify a paper, so the year is added to them.
    if '-' not in title:
        segments[0] = title + '-' + str(year)

    author_segments = []
    for author in authors:
        if not author:
            continue

        # Only the last name is used: initials of the given names are
        # not part of the fingerprint anymore.
        last_name_words, last_name_separators = split_name_words(
            remove_diacritics(author[1]))

        # Last name, without the small words such as "van", "der", "de"
        last_words = []
        for i, word in enumerate(last_name_words):
            follows_hyphen = i > 0 and last_name_separators[i - 1] == '-'
            # Slicing keeps an empty word from raising IndexError.
            if word[:1].isupper() or follows_hyphen:
                last_words.append(word)

        # If no word was uppercased, fall back on all the words
        if not last_words:
            last_words = last_name_words

        author_segments.append('-'.join(ulower(w) for w in last_words))

    # Sorting makes the fingerprint independent of the author order.
    segments.extend(sorted(author_segments))
    return '/'.join(segments)
Example #5
0
 def test_flattened(self):
     # Both spellings are expected to be read as the initials J and P
     # with a hyphen reported as the separator between them.
     expected = (['J', 'P'], ['-'])
     for flattened in ('JP.', 'Jp.'):
         self.assertEqual(split_name_words(flattened), expected)
Example #6
0
 def test_unicode(self):
     # Accented letters must be kept inside the words, including when
     # the accented letter is an initial followed by a dot.
     cases = [
         ('Émilie', (['Émilie'], [])),
         ('José', (['José'], [])),
         ('José Alphonse', (['José', 'Alphonse'], [''])),
         ('É. R.', (['É', 'R'], [''])),
     ]
     for name, expected in cases:
         self.assertEqual(split_name_words(name), expected)
Example #7
0
 def test_awkward_spacing(self):
     # Missing or doubled spaces between initials are expected to give
     # the same result as a single space; spaces around a hyphen are
     # expected to leave '-' as the separator.
     cases = [
         ('J.P.', (['J', 'P'], [''])),
         ('J.  P.', (['J', 'P'], [''])),
         ('Jean - Pierre', (['Jean', 'Pierre'], ['-'])),
     ]
     for name, expected in cases:
         self.assertEqual(split_name_words(name), expected)
Example #8
0
 def test_strange_characters(self):
     # TODO ?
     # The stray '*' is expected to be dropped from the first word
     # while the hyphen is still reported as the separator.
     strange_name = 'Jean*-Frederic'
     expected = (['Jean', 'Frederic'], ['-'])
     self.assertEqual(split_name_words(strange_name), expected)
Example #9
0
 def test_probably_not_flattened(self):
     # A whole word followed by a dot is expected to stay a single word,
     # returned without its trailing dot and without any separator.
     words_and_separators = split_name_words('Joseph.')
     self.assertEqual(words_and_separators, (['Joseph'], []))
Example #10
0
 def test_abbreviation(self):
     # 'Ms.' and 'St.' are expected to keep their trailing dot in the
     # returned words.
     cases = (
         ('Ms.', (['Ms.'], [])),
         ('St. Louis', (['St.', 'Louis'], [''])),
     )
     for name, expected in cases:
         self.assertEqual(split_name_words(name), expected)
Example #11
0
def create_paper_plain_fingerprint(title, authors, year):
    """
    Creates a robust summary of a bibliographic reference.
    This plain fingerprint should then be converted to an
    actual fingerprint by hashing it (so that the length remains
    constant).

    :param title: the title of the paper
    :param authors: the list of author names, represented
        as (first_name, last_name) pairs; falsy entries are skipped
        and only the last names are used
    :param year: the year of publication of the paper; only used
        when the normalized title contains no ``-``
    :returns: the normalized title alone when it is longer than 50
        characters, otherwise the normalized title (with ``-<year>``
        appended when it contains no ``-``) followed by one
        ``/<last name>`` segment per author, in sorted order

    >>> create_paper_plain_fingerprint(' It  cleans whitespace And Case\\n',[('John','Doe')], 2015)
    u'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    u'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    u'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    u'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    u'ambiguity-2014/doe'
    """
    title = kill_html(title)
    title = remove_diacritics(title).lower()
    title = stripped_chars.sub('', title)
    title = title.strip()
    title = re.sub('[ -]+', '-', title)

    # If the title is long enough, we return the fingerprint as is
    if len(title) > 50:
        return title

    buf = title
    # If the title is a single word (no '-' after normalization), we add
    # the year (for "Preface", "Introduction" cases)
    if '-' not in title:
        buf += '-' + str(year)

    author_names_list = []
    for author in authors:
        if not author:
            continue
        # The first name does not take part in the fingerprint, so only
        # the last name needs its diacritics removed.
        last_name = remove_diacritics(author[1])

        # Last name, without the small words such as "van", "der", "de":
        # a word is kept when it is capitalized or follows a hyphen.
        last_name_words, last_name_separators = split_name_words(last_name)
        last_words = [
            word for i, word in enumerate(last_name_words)
            # word[:1] is '' for an empty word, so no IndexError there
            if word[:1].isupper()
            or (i > 0 and last_name_separators[i - 1] == '-')
        ]

        # If no word was uppercased, fall back on all the words
        if not last_words:
            last_words = last_name_words

        # Lowercase and join the remaining words
        author_names_list.append('-'.join(ulower(w) for w in last_words))

    # Sorted so that the author order does not change the fingerprint
    return '/'.join([buf] + sorted(author_names_list))
Example #12
0
 def test_unicode(self):
     # Names with accented letters, checked in order: single words, two
     # words separated by a space, and accented initials with dots.
     names = ('Émilie', 'José', 'José Alphonse', 'É. R.')
     expected_results = (
         (['Émilie'], []),
         (['José'], []),
         (['José', 'Alphonse'], ['']),
         (['É', 'R'], ['']),
     )
     for raw_name, expected in zip(names, expected_results):
         self.assertEqual(split_name_words(raw_name), expected)
Example #13
0
 def test_awkward_spacing(self):
     # Initials glued together or separated by two spaces are expected
     # to yield the same words with '' as the separator.
     initials = (['J', 'P'], [''])
     for spaced in ('J.P.', 'J.  P.'):
         self.assertEqual(split_name_words(spaced), initials)

     # Spaces around the hyphen are expected not to hide the '-'.
     hyphenated = split_name_words('Jean - Pierre')
     self.assertEqual(hyphenated, (['Jean', 'Pierre'], ['-']))
Example #14
0
 def test_strange_characters(self):
     # TODO ?
     # Expectation: the '*' does not end up in any word and the hyphen
     # is the reported separator.
     result = split_name_words('Jean*-Frederic')
     self.assertEqual(result, (['Jean', 'Frederic'], ['-']))
Example #15
0
 def test_probably_not_flattened(self):
     # 'Joseph.' is expected to come back as the single word 'Joseph'
     # (dot removed) with an empty separator list.
     expected = (['Joseph'], [])
     self.assertEqual(split_name_words('Joseph.'), expected)
Example #16
0
 def test_abbreviation(self):
     # The dot of the abbreviations 'Ms.' and 'St.' is expected to be
     # kept in the returned word.
     title_only = split_name_words('Ms.')
     self.assertEqual(title_only, (['Ms.'], []))

     saint_name = split_name_words('St. Louis')
     self.assertEqual(saint_name, (['St.', 'Louis'], ['']))
Example #17
0
def create_paper_plain_fingerprint(title, authors, year):
    """
    Creates a robust summary of a bibliographic reference.
    This plain fingerprint should then be converted to an
    actual fingerprint by hashing it (so that the length remains
    constant).

    :param title: the title of the paper
    :param authors: the list of author names, represented
        as (first_name, last_name) pairs; falsy entries are ignored
        and the first names do not influence the result
    :param year: the year of publication of the paper, appended to
        the title part only when the normalized title contains no ``-``
    :returns: the normalized title if it is longer than 50 characters;
        otherwise the normalized title, the optional ``-<year>`` suffix
        and a ``/<last name>`` segment for each author, sorted

    >>> create_paper_plain_fingerprint(' It  cleans whitespace And Case\\n',[('John','Doe')], 2015)
    u'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    u'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    u'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    u'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    u'ambiguity-2014/doe'
    """
    title = kill_html(title)
    title = remove_diacritics(title).lower()
    title = stripped_chars.sub('', title)
    title = title.strip()
    title = re.sub('[ -]+', '-', title)

    # If the title is long enough, we return the fingerprint as is
    if len(title) > 50:
        return title

    # If the title is a single word, we add the year (for "Preface",
    # "Introduction" cases)
    head = title if '-' in title else title + '-' + str(year)

    author_names_list = []
    for author in authors:
        if not author:
            continue

        # Last name, without the small words such as "van", "der", "de".
        # The first name is not used, so it is not normalized.
        last_name_words, last_name_separators = split_name_words(
            remove_diacritics(author[1]))
        last_words = []
        for i, w in enumerate(last_name_words):
            # w[:1] instead of w[0]: an empty word counts as not
            # capitalized rather than raising IndexError.
            if (w[:1].isupper()
                    or (i > 0 and last_name_separators[i - 1] == '-')):
                last_words.append(w)

        # If no word was uppercased, fall back on all the words
        if not last_words:
            last_words = last_name_words

        # Lowercase
        author_names_list.append('-'.join(ulower(w) for w in last_words))

    # Sorting removes any dependence on the order of `authors`.
    author_names_list.sort()
    return '/'.join([head] + author_names_list)
Example #18
0
 def test_flattened(self):
     # Two letters followed by a single dot, whatever the case of the
     # second letter, are expected to be split into the initials J and P
     # with '-' as the separator.
     cases = [
         ('JP.', (['J', 'P'], ['-'])),
         ('Jp.', (['J', 'P'], ['-'])),
     ]
     for name, expected in cases:
         self.assertEqual(split_name_words(name), expected)