def test_title_word_normalisation():
    """Run normalise_title_word over a table of (input, expected) pairs.

    The table, the function under test and the label "Title word" are
    handed to test_framework.test_and_compare, which performs the actual
    comparison and reporting.
    """
    word_cases = [
        # Words whose expected output equals the input.
        ("word", "word"),
        ("tube", "tube"),
        ("light", "light"),
        ("sense", "sense"),
        ("domain", "domain"),
        ("jacobi", "jacobi"),
        ("chance", "chance"),
        # Digit strings, with and without a hyphenated suffix.
        ("12345", "12345"),
        ("12345-7", "12345-7"),
        ("123456", "123456"),
        ("123456-7", "123456"),
        ("scaling", "scaling"),
        ("strange", "strange"),
        # Words whose expected output is a shortened form of the input.
        ("semantic", "semant"),
        ("research", "researc"),
        ("citation", "citatio"),
        ("searching", "search"),
        ("retrieval", "retriev"),
        ("iostreams", "iostrea"),
        ("similarity", "similar"),
        ("co-occurrence", "co-occ"),
    ]
    test_framework.test_and_compare(
        word_cases,
        normalise_title_word,
        "Title word",
    )
def test_normalise_first_N_words_of_title():
    """Run normalise_first_N_words_of_title with N fixed at 3 over sample titles.

    Each case pairs a full title with the expected hyphen-joined key.  The
    cases, a one-argument wrapper around the function under test, and the
    label "First 3 title words" are handed to
    test_framework.test_and_compare.
    """

    def normalise_first_3_title_words(title):
        # test_and_compare is given a one-argument callable, so the word
        # count is pinned here.
        return normalise_first_N_words_of_title(title, 3)

    title_cases = [
        ("Experiments in word domain disambiguation for parallel texts",
         "experim-word-domain"),
        ("The role of domain information in word sense disambiguation",
         "role-domain-inform"),
        ("Automatic retrieval and clustering of similar words",
         "automat-retriev-cluster"),
        ("A general framework for distributional similarity",
         "general-framew-distrib"),
        ("A maximum entropy part-of-speech tagger",
         "maximum-entropy-part-of"),
        ("Domain-specific sense distributions and predominant sense acquisition",
         "domain-sense-distrib"),
        ("Sussx: WSD using automatically acquired predominant senses",
         "sussx-wsd-automat"),
        ("Scaling context space",
         "scaling-context-space"),
        ("Co-occurrence Retrieval: A Flexible Framework for Lexical Distributional Similarity",
         "co-occ-retriev-flexibl"),
        ("Discovering corpus-specific word senses",
         "discov-corpus-word"),
        ("Using domain information for word sense disambiguation",
         "domain-inform-word"),
        ("Web-scale distributional similarity and entity set expansion",
         "web-distrib-similar"),
        ("From predicting predominant senses to local context for word sense disambiguation",
         "predict-predom-senses"),
        ("Large-Scale Syntactic Processing: Parsing the Web Final Report of the 2009 JHU CLSP Workshop",
         "large-syntact-process"),
        ("Robust, applied morphological generation",
         "robust-applied-morphol"),
        ("From distributional to semantic similarity",
         "distrib-semant-similar"),
        ("TnT: a statistical part-of-speech tagger",
         "tnt-statist-part-of"),
        ("A best-first probabilistic shift-reduce parser",
         "best-first-probab"),
        ("Using automatically acquired predominant senses for word sense disambiguation",
         "automat-acquir-predom"),
        ("Evaluating WordNet-based Measures of Lexical Semantic Relatedness",
         "evaluat-wordnet-measur"),
    ]
    test_framework.test_and_compare(
        title_cases,
        normalise_first_3_title_words,
        "First 3 title words",
    )
def test_author_name_normalisation():
    """Run normalise_lastname over a table of (surname, expected) pairs.

    The table, the function under test and the label "Author name" are
    handed to test_framework.test_and_compare.
    """
    surname_cases = [
        ("Kay", "kay"),
        ("Mr.E", "mre"),
        ("Wang", "wang"),
        ("Curran", "curran"),
        ("Fekete", "fekete"),
        ("Manning", "manning"),
        ("Nothman", "nothman"),
        ("O'Keefe", "okeefe"),
        ("Ringland", "ringlan"),
        ("Koprinska", "koprins"),
        ("Kummerfeld", "kummerf"),
        ("Balasuriya", "balasur"),
        (u"M\u00FCller", u'muller'),  # Contains a non-ASCII character
        ("Durrant-White", "durrant"),
    ]
    test_framework.test_and_compare(
        surname_cases,
        normalise_lastname,
        "Author name",
    )