def dutch():
    """Train a tagger on the Dutch Alpino corpus and dump it.

    Steps, in order:

    1. Call ``downloadNLTKAlpinoCorpus()`` so the corpus is fetched before
       it is read.
    2. Read the tagged sentences from ``nltk.corpus.alpino`` with
       ``simplify_tags=True``.
    3. Train a fresh ``BrillTrigramTagger`` on those sentences.
    4. Hand the trained tagger's ``tagger`` attribute to ``dump`` together
       with the file name ``"dutch_tagger.pickle"``.

    The function takes no arguments and returns ``None``.
    """
    # Imports are kept local, and in this order, so that the corpus download
    # runs before ``nltk.corpus.alpino`` is imported.
    from collective.classification.data.downloader import (
        downloadNLTKAlpinoCorpus,
    )
    downloadNLTKAlpinoCorpus()
    from nltk.corpus import alpino

    training_sentences = alpino.tagged_sents(simplify_tags=True)

    brill_trainer = BrillTrigramTagger()
    brill_trainer.train(training_sentences)

    # Only the wrapped ``tagger`` attribute is dumped, not the trainer itself.
    dump(brill_trainer.tagger, "dutch_tagger.pickle")
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', tagset='universal'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(tagset='universal'),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(tagset='universal'),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(tagset='universal'),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(tagset='universal'),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(tagset='universal'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
     lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
     lambda: mac_morpho.tagged_sents(tagset='universal'),
 'Spanish: CESS-ESP Corpus (simplified)':
Example #3
0
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', tagset='simple'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(tagset='simple'),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(tagset='simple'),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(tagset='simple'),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(tagset='simple'),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(tagset='simple'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
     lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
     lambda: mac_morpho.tagged_sents(tagset='simple'),
 'Spanish: CESS-ESP Corpus (simplified)':
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(simplify_tags=True),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(simplify_tags=True),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(simplify_tags=True),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(simplify_tags=True),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(simplify_tags=True),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
     lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
     lambda: mac_morpho.tagged_sents(simplify_tags=True),
 'Spanish: CESS-ESP Corpus (simplified)':
Example #5
0
        categories=["news", "editorial", "reviews"], tagset="simple"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="simple"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results