Exemple #1
0
def test_title_word_normalisation():
    """Run normalise_title_word against a table of (input, expected) pairs.

    The table, the function under test and the label "Title word" are
    handed to test_framework.test_and_compare.
    """
    def unchanged(words):
        # Pair each word with itself: the expected output equals the input.
        return [(word, word) for word in words]

    cases = unchanged([
        "word", "tube", "light", "sense", "domain", "jacobi", "chance",
        "12345", "12345-7", "123456",
    ])
    cases.append(("123456-7", "123456"))
    cases += unchanged(["scaling", "strange"])
    # Inputs whose expected output differs from the input.
    cases += [
        ("semantic", "semant"),
        ("research", "researc"),
        ("citation", "citatio"),
        ("searching", "search"),
        ("retrieval", "retriev"),
        ("iostreams", "iostrea"),
        ("similarity", "similar"),
        ("co-occurrence", "co-occ"),
    ]
    test_framework.test_and_compare(cases, normalise_title_word, "Title word")
Exemple #2
0
def test_title_word_normalisation():
  """Check normalise_title_word on a fixed list of (input, expected) pairs.

  The pairs are passed, in the order listed, to
  test_framework.test_and_compare with the label "Title word".
  """
  # Words whose expected output is the word itself.
  kept_before = [
    "word", "tube", "light", "sense", "domain",
    "jacobi", "chance", "12345", "12345-7", "123456",
  ]
  kept_after = ["scaling", "strange"]
  # Words whose expected output differs from the input.
  altered = [
    ("semantic", "semant"),
    ("research", "researc"),
    ("citation", "citatio"),
    ("searching", "search"),
    ("retrieval", "retriev"),
    ("iostreams", "iostrea"),
    ("similarity", "similar"),
    ("co-occurrence", "co-occ"),
  ]
  expected_pairs = (
    [(word, word) for word in kept_before]
    + [("123456-7", "123456")]
    + [(word, word) for word in kept_after]
    + altered
  )
  test_framework.test_and_compare(
    expected_pairs, normalise_title_word, "Title word")
Exemple #3
0
def test_normalise_first_N_words_of_title():
  """Check normalise_first_N_words_of_title with N fixed at 3.

  Each table entry is (title, expected key). The table and a one-argument
  wrapper around the function under test are handed to
  test_framework.test_and_compare with the label "First 3 title words".
  """
  word_limit = 3

  def normalise_first_3_title_words(title):
    # One-argument adapter so the comparison helper can call it per title.
    return normalise_first_N_words_of_title(title, word_limit)

  title_cases = [
    (
      "Experiments in word domain disambiguation for parallel texts",
      "experim-word-domain",
    ),
    (
      "The role of domain information in word sense disambiguation",
      "role-domain-inform",
    ),
    (
      "Automatic retrieval and clustering of similar words",
      "automat-retriev-cluster",
    ),
    (
      "A general framework for distributional similarity",
      "general-framew-distrib",
    ),
    (
      "A maximum entropy part-of-speech tagger",
      "maximum-entropy-part-of",
    ),
    (
      "Domain-specific sense distributions and predominant sense acquisition",
      "domain-sense-distrib",
    ),
    (
      "Sussx: WSD using automatically acquired predominant senses",
      "sussx-wsd-automat",
    ),
    (
      "Scaling context space",
      "scaling-context-space",
    ),
    (
      "Co-occurrence Retrieval: A Flexible Framework for Lexical Distributional Similarity",
      "co-occ-retriev-flexibl",
    ),
    (
      "Discovering corpus-specific word senses",
      "discov-corpus-word",
    ),
    (
      "Using domain information for word sense disambiguation",
      "domain-inform-word",
    ),
    (
      "Web-scale distributional similarity and entity set expansion",
      "web-distrib-similar",
    ),
    (
      "From predicting predominant senses to local context for word sense disambiguation",
      "predict-predom-senses",
    ),
    (
      "Large-Scale Syntactic Processing: Parsing the Web Final Report of the 2009 JHU CLSP Workshop",
      "large-syntact-process",
    ),
    (
      "Robust, applied morphological generation",
      "robust-applied-morphol",
    ),
    (
      "From distributional to semantic similarity",
      "distrib-semant-similar",
    ),
    (
      "TnT: a statistical part-of-speech tagger",
      "tnt-statist-part-of",
    ),
    (
      "A best-first probabilistic shift-reduce parser",
      "best-first-probab",
    ),
    (
      "Using automatically acquired predominant senses for word sense disambiguation",
      "automat-acquir-predom",
    ),
    (
      "Evaluating WordNet-based Measures of Lexical Semantic Relatedness",
      "evaluat-wordnet-measur",
    ),
  ]
  test_framework.test_and_compare(
    title_cases, normalise_first_3_title_words, "First 3 title words")
Exemple #4
0
def test_author_name_normalisation():
    """Run normalise_lastname against a table of (last name, expected) pairs.

    The table, the function under test and the label "Author name" are
    handed to test_framework.test_and_compare.
    """
    surname_cases = [
        ("Kay", "kay"), ("Mr.E", "mre"), ("Wang", "wang"),
        ("Curran", "curran"), ("Fekete", "fekete"), ("Manning", "manning"),
        ("Nothman", "nothman"), ("O'Keefe", "okeefe"),
        ("Ringland", "ringlan"), ("Koprinska", "koprins"),
        ("Kummerfeld", "kummerf"), ("Balasuriya", "balasur"),
        # Input with a non-ASCII character (u-umlaut); expected form is ASCII.
        (u"M\u00FCller", u'muller'),
        ("Durrant-White", "durrant"),
    ]
    test_framework.test_and_compare(
        surname_cases, normalise_lastname, "Author name")
Exemple #5
0
def test_author_name_normalisation():
  """Check normalise_lastname on a fixed list of (last name, expected) pairs.

  The pairs are passed, in the order listed, to
  test_framework.test_and_compare with the label "Author name".
  """
  name_pairs = [
    ("Kay", "kay"), ("Mr.E", "mre"),
    ("Wang", "wang"), ("Curran", "curran"),
    ("Fekete", "fekete"), ("Manning", "manning"),
    ("Nothman", "nothman"), ("O'Keefe", "okeefe"),
    ("Ringland", "ringlan"), ("Koprinska", "koprins"),
    ("Kummerfeld", "kummerf"), ("Balasuriya", "balasur"),
    # The next input contains a non-ASCII character.
    (u"M\u00FCller", u'muller'),
    ("Durrant-White", "durrant"),
  ]
  test_framework.test_and_compare(name_pairs, normalise_lastname, "Author name")
Exemple #6
0
def test_normalise_first_N_words_of_title():
    """Run normalise_first_N_words_of_title (N = 3) over (title, key) pairs.

    A one-argument wrapper fixes N at 3. The wrapper, the table and the
    label "First 3 title words" are handed to
    test_framework.test_and_compare.
    """
    def normalise_first_3_title_words(title):
        # Fix the word count at 3 so the wrapper takes only the title.
        return normalise_first_N_words_of_title(title, 3)

    expected_keys = [
        (
            "Experiments in word domain disambiguation for parallel texts",
            "experim-word-domain",
        ),
        (
            "The role of domain information in word sense disambiguation",
            "role-domain-inform",
        ),
        (
            "Automatic retrieval and clustering of similar words",
            "automat-retriev-cluster",
        ),
        (
            "A general framework for distributional similarity",
            "general-framew-distrib",
        ),
        (
            "A maximum entropy part-of-speech tagger",
            "maximum-entropy-part-of",
        ),
        (
            "Domain-specific sense distributions and predominant sense acquisition",
            "domain-sense-distrib",
        ),
        (
            "Sussx: WSD using automatically acquired predominant senses",
            "sussx-wsd-automat",
        ),
        (
            "Scaling context space",
            "scaling-context-space",
        ),
        (
            "Co-occurrence Retrieval: A Flexible Framework for Lexical Distributional Similarity",
            "co-occ-retriev-flexibl",
        ),
        (
            "Discovering corpus-specific word senses",
            "discov-corpus-word",
        ),
        (
            "Using domain information for word sense disambiguation",
            "domain-inform-word",
        ),
        (
            "Web-scale distributional similarity and entity set expansion",
            "web-distrib-similar",
        ),
        (
            "From predicting predominant senses to local context for word sense disambiguation",
            "predict-predom-senses",
        ),
        (
            "Large-Scale Syntactic Processing: Parsing the Web Final Report of the 2009 JHU CLSP Workshop",
            "large-syntact-process",
        ),
        (
            "Robust, applied morphological generation",
            "robust-applied-morphol",
        ),
        (
            "From distributional to semantic similarity",
            "distrib-semant-similar",
        ),
        (
            "TnT: a statistical part-of-speech tagger",
            "tnt-statist-part-of",
        ),
        (
            "A best-first probabilistic shift-reduce parser",
            "best-first-probab",
        ),
        (
            "Using automatically acquired predominant senses for word sense disambiguation",
            "automat-acquir-predom",
        ),
        (
            "Evaluating WordNet-based Measures of Lexical Semantic Relatedness",
            "evaluat-wordnet-measur",
        ),
    ]
    test_framework.test_and_compare(
        expected_keys, normalise_first_3_title_words, "First 3 title words")