Example #1
0
def main():
    """
    Fit a ``CountVectorizer`` on ``corpus`` and run ``check`` under swapped analyzers.

    The vectorizer is fitted once with ``space_analyzer``. ``check`` is then
    called three times on that same fitted instance: with the space analyzer,
    after switching ``analyzer`` to ``comma_analyzer``, and after switching
    back to ``space_analyzer``.

    :return: always ``0``
    """
    # modeling
    vectorizer = CountVectorizer(analyzer=space_analyzer)
    vectorizer.fit(corpus)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); prefer the replacement when it exists and fall
    # back for older releases. tolist() keeps the printed form a plain list
    # of strings, as get_feature_names() produced.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print('')

    print('[space vectorizer]')
    check(vectorizer)

    # The analyzer is swapped on the already fitted instance (no refit).
    print('[comma vectorizer]')
    vectorizer.analyzer = comma_analyzer
    check(vectorizer)

    print('[space vectorizer]')
    vectorizer.analyzer = space_analyzer
    check(vectorizer)

    return 0
Example #2
0
def build_matrix_count(bmt__document_list, input_type='filename', with_analyzer=False, amr_tool=None):
    """
    Build a term-document count matrix (``float64``) and its vocabulary.

    :param bmt__document_list: documents handed to ``fit_transform``; how each
        item is read is governed by ``input_type``
    :param input_type: forwarded as the ``input`` argument of ``CountVectorizer``
    :param with_analyzer: when true, terms are the Porter-stemmed tokens of the
        stock ``CountVectorizer`` analyzer; when false, terms are the graph
        nodes obtained through ``amr_tool``
    :param amr_tool: object providing ``amr_graph_reader(doc)`` and
        ``parse_graph(graph_str)``; required when ``with_analyzer`` is false
    :return: tuple ``(term_document_matrix, vocabulary)`` - the result of
        ``fit_transform`` and the fitted ``vocabulary_``
    :raises ValueError: if ``with_analyzer`` is false and ``amr_tool`` is None
    """
    # Fail early with a clear message instead of an AttributeError on None
    # raised from inside fit_transform.
    if not with_analyzer and amr_tool is None:
        raise ValueError(
            "amr_tool must be provided when with_analyzer is False")

    if with_analyzer:
        # One stemmer serves every document; no need to rebuild it per call.
        stemmer = PorterStemmer()

        class _StemmingCountVectorizer(CountVectorizer):
            # Stemming wraps the stock analyzer through build_analyzer()
            # rather than by assigning a callable to ``analyzer``: since
            # scikit-learn 0.21 a callable analyzer receives the already-read
            # file contents when input is 'filename'/'file', so the wrapped
            # stock analyzer would then try to open that text as a path.

            def build_analyzer(self):
                tokenize = CountVectorizer.build_analyzer(self)
                return lambda doc: (stemmer.stem(word) for word in tokenize(doc))

        vectorizer = _StemmingCountVectorizer(input=input_type, dtype=np.float64)
    else:
        def nodes(doc):
            # NOTE(review): whether ``doc`` is the raw item or already-read
            # text depends on the scikit-learn version (callable analyzers get
            # decoded content since 0.21) - confirm amr_graph_reader expects
            # what it receives.
            graph_str = amr_tool.amr_graph_reader(doc)
            graph_list = amr_tool.parse_graph(graph_str)
            # Flatten the nodes of every parsed graph into one term list.
            return [node for graph in graph_list for node in graph.nodes]

        vectorizer = CountVectorizer(input=input_type, dtype=np.float64)
        vectorizer.analyzer = nodes

    term_document_matrix = vectorizer.fit_transform(bmt__document_list)

    return term_document_matrix, vectorizer.vocabulary_
Example #3
0
def generate_bag_of_words(bdtm__document_list, input_type='filename'):
    """
    Learn a vocabulary of Porter-stemmed tokens from a document collection.

    :param bdtm__document_list: documents handed to ``CountVectorizer.fit``;
        how each item is read is governed by ``input_type``
    :param input_type: forwarded as the ``input`` argument of ``CountVectorizer``
    :return: the fitted vectorizer's ``vocabulary_``
    """
    # One stemmer serves every document; no need to rebuild it per call.
    stemmer = PorterStemmer()

    class _StemmingCountVectorizer(CountVectorizer):
        # Stemming wraps the stock analyzer through build_analyzer() rather
        # than by assigning a callable to ``analyzer``: since scikit-learn
        # 0.21 a callable analyzer receives the already-read file contents
        # when input is 'filename'/'file', so the wrapped stock analyzer
        # would then try to open that text as a path.

        def build_analyzer(self):
            tokenize = CountVectorizer.build_analyzer(self)
            return lambda doc: (stemmer.stem(word) for word in tokenize(doc))

    vectorizer = _StemmingCountVectorizer(input=input_type)
    vectorizer.fit(bdtm__document_list)

    return vectorizer.vocabulary_
Example #4
0
    def generate_bag_of_words(self, generate_bow__path_list):
        """
        Build a count matrix of Porter-stemmed tokens from a list of files.

        :param generate_bow__path_list: file paths handed to ``fit_transform``
            (the vectorizer is created with ``input='filename'``)
        :return: tuple ``(term_document_matrix, vocabulary)`` - the result of
            ``fit_transform`` and the fitted ``vocabulary_``
        """
        # One stemmer serves every document; no need to rebuild it per call.
        stemmer = PorterStemmer()

        class _StemmingCountVectorizer(CountVectorizer):
            # Stemming wraps the stock analyzer through build_analyzer()
            # rather than by assigning a callable to ``analyzer``: since
            # scikit-learn 0.21 a callable analyzer receives the already-read
            # file contents when input is 'filename', so the wrapped stock
            # analyzer would then try to open that text as a path.

            def build_analyzer(self):
                tokenize = CountVectorizer.build_analyzer(self)
                return lambda doc: (stemmer.stem(word) for word in tokenize(doc))

        vectorizer = _StemmingCountVectorizer(input='filename')

        term_document_matrix = vectorizer.fit_transform(
            generate_bow__path_list)

        return term_document_matrix, vectorizer.vocabulary_