Example #1
0
def build_doc_preprocessor():
    """Build the preprocessor used for feature extraction on each EDU of a doc.

    Returns
    -------
    The `preprocess` attribute of a freshly built
    `DocumentPlusPreprocessor` (presumably a bound method -- confirm
    against the class definition).
    """
    # TODO re-do in a better, more modular way
    # EXPERIMENTAL: use the entry stored under key 3200 in the mapping
    # returned by fetch_brown_clusters()
    nb_classes = 3200
    brown_clusters = fetch_brown_clusters()[nb_classes]
    # no token filter for now; token_filter_li2014 is the noted alternative
    preprocessor = DocumentPlusPreprocessor(
        token_filter=None,
        word2clust=brown_clusters,
    )
    return preprocessor.preprocess
Example #2
0
def build_doc_preprocessor():
    """Return the document preprocessing callable for EDU feature extraction.

    A `DocumentPlusPreprocessor` is configured with no token filter and
    with the Brown clustering found under key 3200 of
    `fetch_brown_clusters()`; its `preprocess` attribute is returned.
    """
    # TODO re-do in a better, more modular way
    settings = {
        # filtering disabled; token_filter_li2014 is the noted alternative
        'token_filter': None,
        # EXPERIMENTAL
        'word2clust': fetch_brown_clusters()[3200],
    }
    return DocumentPlusPreprocessor(**settings).preprocess
Example #3
0
def test_brown_clusters_acl2010():
    """Fetch Brown clusters and print them for some discourse connectors.

    Every (key, clustering) pair returned by `fetch_brown_clusters()` is
    visited in sorted order.  For each one a three-line header is printed,
    followed by one "connector<TAB>cluster" line per connector.  A
    connector missing from a clustering is not handled here: the lookup
    error propagates to the caller.
    """
    # the connector list does not depend on the clustering: build it once
    connectors = (
        'because', 'as', 'thus', 'so', 'then',
        'according', 'including', 'and', 'or',
        'for', 'before', 'after', 'while', 'by',
        'without', 'despite', 'although', 'though',
        'following', 'once', 'if', 'unless',
    )

    all_clusterings = fetch_brown_clusters()
    for size, word2clust in sorted(all_clusterings.items()):
        title = 'Brown clusters with {} classes'.format(size)
        print('')
        print(title)
        print('----------------------------------')
        for word in connectors:
            row = '{}\t{}'.format(word, word2clust[word])
            print(row)
Example #4
0
def test_brown_clusters_acl2010():
    """Print the Brown cluster of a handful of discourse connectors.

    Walks the items of `fetch_brown_clusters()` in sorted order and, for
    each clustering, writes a blank line, a title line, a dashed rule and
    then one tab-separated "connector, cluster" line per connector to
    standard output.  Nothing is returned.
    """
    row_template = '{}\t{}'
    rule = '----------------------------------'
    # same connectors for every clustering, so defined outside the loop
    disc_conns = [
        'because', 'as', 'thus', 'so', 'then', 'according', 'including',
        'and', 'or', 'for', 'before', 'after', 'while', 'by', 'without',
        'despite', 'although', 'though', 'following', 'once', 'if',
        'unless',
    ]

    for nb_classes, clustering in sorted(fetch_brown_clusters().items()):
        heading = 'Brown clusters with {} classes'.format(nb_classes)
        for header_line in ('', heading, rule):
            print(header_line)
        # look up and print one connector at a time, so output already
        # written is kept if a later lookup fails
        for conn in disc_conns:
            print(row_template.format(conn, clustering[conn]))