Esempio n. 1
0
    def test(self):
        """Print term counts and ``compute_tfidf`` output for a few ``machado`` texts.

        Takes the first five file ids from ``machado``, runs each raw text
        through ``clean_and_stem``, builds the matrix returned by
        ``build_bow`` and prints the (term, total) pairs sorted by descending
        total. It then prints the result of ``compute_tfidf`` on the same raw
        texts. The method makes no assertions.
        """
        import operator

        p = PreProcessing([], [], [])
        cts = machado.fileids()[:5]

        # Read each raw text once and reuse it for both the token list and
        # the TF-IDF input instead of hitting the corpus reader twice.
        texts = {}
        tokens = []
        for c in cts:
            text = machado.raw(c)
            texts[c] = text
            tokens += p.clean_and_stem(text)

        bow, bow_features_names = p.build_bow(tokens)
        # Collapse axis 0 so each feature name pairs with a single total.
        dist = np.sum(bow.toarray(), axis=0)
        tbow = dict(zip(bow_features_names, dist))

        # A parenthesised single-argument print is valid on Python 2 and 3.
        print(sorted(tbow.items(), key=operator.itemgetter(1), reverse=True))

        terms = p.compute_tfidf(texts.values(), top_n=10, eliminate_zeros=True)
        print(terms)
Esempio n. 2
0
    def test_corpus(self):
        """Dump term counts and ``compute_tfidf`` output for the sel1/sel2 corpus.

        Reads the ``nnp``, ``terms`` and ``patterns`` line lists from
        ``../data/pt_BR`` and passes them to ``PreProcessing``. Writes two
        files in the current directory: ``bow`` holds the number of distinct
        terms immediately followed by the (term, total) list sorted by
        descending total, and ``terms`` holds the ``compute_tfidf`` result.
        The method makes no assertions.
        """
        from operator import itemgetter

        def stripped_lines(path):
            # One entry per line of the file, trailing whitespace removed.
            with open(path) as handle:
                return [row.rstrip() for row in handle]

        nnp = stripped_lines("../data/pt_BR/nnp")
        term_list = stripped_lines("../data/pt_BR/terms")
        patterns = stripped_lines("../data/pt_BR/patterns")

        data = LoadData(['../corpus/sel1.csv', '../corpus/sel2.csv']).load()
        p = PreProcessing(nnp, term_list, patterns)

        tokens = []
        for document in data.values():
            tokens.extend(p.clean_and_stem(document))

        bow, feature_names = p.build_bow(tokens)
        # Collapse axis 0 so each feature name pairs with a single total.
        totals = np.sum(bow.toarray(), axis=0)
        counts = dict(zip(feature_names, totals))

        with open("bow", "w") as out:
            out.write(str(len(counts)))
            ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
            out.write(str(ranked))

        tfidf_terms = p.compute_tfidf(data.values(), eliminate_zeros=True)
        with open("terms", "w") as out:
            out.write(str(tfidf_terms))
Esempio n. 3
0
    def test_should_build_bag_of_words(self):
        """Check that ``build_bow`` on the cleaned, stemmed sample text has shape (7, 6)."""
        # Raw string for the regex: "\d" is not a valid str escape and
        # triggers a warning on newer Python versions; the value is unchanged.
        p = PreProcessing(["joao", "maria"], [], [r"\d+", "nomeemp*"])

        text = "O técnico João foi até a casa da Maria (NOMEEMPRESA) e solucionou o problema. " \
               "Ele não foi solucionado? NomeempProd"
        tokens = p.stem(p.clean(text))

        # The feature-name half of the result is not needed here.
        bow, _ = p.build_bow(tokens)
        # Compare the shape as a tuple rather than via its string form, and
        # use assertEqual: assertEquals is a deprecated alias.
        self.assertEqual((7, 6), tuple(bow.shape))