Example #1
 def test_sentence(self):
     assert list(
         textparser.word_tokenize(
             text='Life is about making an impact, not making an income.',
         )) == [('life', ), ('is', ), ('about', ), ('making', ), ('an', ),
                ('impact', ), ('not', ), ('making', ), ('an', ),
                ('income', )]
Example #2
 def test_ignores_stopwords(self):
     assert list(
         textparser.word_tokenize(
             text='The first rule of python is',
             stopwords=set(['the', 'of', 'is']),
             min_length=1,
         )) == [('first', ), ('rule', ), ('python', )]
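As the two examples above suggest, textparser.word_tokenize returns a lazy iterator of tuples (one word per tuple unless ngrams is set), which is why every test wraps it in list(). A minimal interactive sketch along the same lines, assuming textparser is available as in the tests:

import textparser

# Tokens come back as 1-tuples; stopwords and min_length filter the stream.
tokens = list(textparser.word_tokenize(
    text='The first rule of python is',
    stopwords={'the', 'of', 'is'},
    min_length=1,
))
print(tokens)  # per the test above, this should be [('first',), ('rule',), ('python',)]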
    def get_data_from_collection(self):
        if os.path.exists(self.env_dir + self.dir) and len(
                os.listdir(self.env_dir + self.dir)) > 0 and len(
                    self.collection) > 0:
            index = hashedindex.HashedIndex()
            doc_count = 0

            with io.open(self.env_dir + self.dir + self.collection,
                         'r',
                         encoding='utf8') as fp:
                for line in fp.readlines():
                    for term in textparser.word_tokenize(line,
                                                         min_length=2,
                                                         ignore_numeric=True):
                        time.sleep(1)
                        index.add_term_occurrence(
                            term, self.collection + "/line-" + str(doc_count))

                    self.docnames.append(self.collection + "/line-" +
                                         str(doc_count))

                    doc_count = doc_count + 1

            # This is a proof of concept to check that a matrix of 1's and 0's with the term incidences is actually generated
            for doc in self.docnames:
                aux_doc = []
                for term in index.terms():
                    if round(index.get_term_frequency(term, doc)) > 0:
                        aux_doc.append(1)
                    else:
                        aux_doc.append(0)

                self.matrix.append(aux_doc)

            self.matrix = np.matrix(self.matrix)

            # This builds the array of terms
            for term in index.terms():
                self.terms.append(re.sub(r"(\('|',\))", "", str(term)))

        else:
            print("Attempting to create '{}' into {}.".format(
                self.dir, self.env_dir))

            if not os.path.exists(self.env_dir + self.dir):
                os.mkdir(self.env_dir + self.dir, mode=0o777)
                print(
                    "The input folder, '{}', was created successfully in {}.".
                    format(self.dir, self.env_dir))

            else:
                print("The input folder, '{}', is empty in {}.".format(
                    self.dir, self.env_dir))

        return self.matrix, self.docnames, self.terms
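The method above uses hashedindex and textparser to treat each line of the collection file as a document and then derives a binary term-document incidence matrix from the index. A condensed, self-contained sketch of the same idea, using a small hypothetical in-memory corpus in place of the collection file (the hashedindex calls mirror the ones above):

import hashedindex
import numpy as np
import textparser

# Hypothetical corpus standing in for the lines of the collection file.
corpus = {
    'collection/line-0': 'Life is about making an impact',
    'collection/line-1': 'not making an income',
}

index = hashedindex.HashedIndex()
for doc_name, text in corpus.items():
    for term in textparser.word_tokenize(text, min_length=2, ignore_numeric=True):
        index.add_term_occurrence(term, doc_name)

# One row per document, one column per term: 1 if the term occurs in the document, else 0.
matrix = np.matrix([
    [1 if round(index.get_term_frequency(term, doc_name)) > 0 else 0
     for term in index.terms()]
    for doc_name in corpus
])
print(matrix.shape)  # (number of documents, number of distinct terms)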
    def get_data_from_input(self):
        if os.path.exists(self.env_dir + self.dir) and len(
                os.listdir(self.env_dir + self.dir)) > 0:
            self.docnames = [
                f for f in listdir(self.env_dir + self.dir)
                if isfile(join(self.env_dir + self.dir, f))
            ]
            index = hashedindex.HashedIndex()

            for doc in self.docnames:
                with io.open(self.env_dir + self.dir + doc,
                             'r',
                             encoding='utf8') as fp:
                    text = re.sub('(\t\n|\t|\n|_)', " ", fp.read())

                    for term in textparser.word_tokenize(text,
                                                         min_length=2,
                                                         ignore_numeric=True):
                        index.add_term_occurrence(term, doc)

            # This is a proof of concept to check that a matrix of 1's and 0's with the term incidences is actually generated
            for doc in self.docnames:
                aux_doc = []
                for term in index.terms():
                    if round(index.get_term_frequency(term, doc)) > 0:
                        aux_doc.append(1)
                    else:
                        aux_doc.append(0)

                self.matrix.append(aux_doc)

            self.matrix = np.matrix(self.matrix)

            # This builds the array of terms
            for term in index.terms():
                self.terms.append(re.sub(r"(\('|',\))", "", str(term)))

        else:
            print("Attempting to create '{}' into {}.".format(
                self.dir, self.env_dir))

            if not os.path.exists(self.env_dir + self.dir):
                os.mkdir(self.env_dir + self.dir, mode=0o777)
                print(
                    "The input folder, '{}', was created successfully in {}.".
                    format(self.dir, self.env_dir))

            else:
                print("The input folder, '{}', is empty in {}.".format(
                    self.dir, self.env_dir))

        return self.matrix, self.docnames, self.terms
    def preprocess_tweet(self, text):
        text = text.lower()
        # Strip the 'url' placeholder; optionally also strip @mentions and #hashtags.
        stripped = re.sub(r'\burl\b', '', text)
        if self.config.remove_hash_tags_and_mentions:
            stripped = re.sub(r'(\b|\s)([@#][\w_-]+)', '', stripped)

        tokens = list(
            map(
                lambda x: x[0],
                textparser.word_tokenize(
                    stripped,
                    stopwords.words('english')
                    if self.config.remove_stopwords else [])))

        return tokens
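A rough standalone equivalent of that tweet-preprocessing step, with the config object replaced by plain keyword arguments (hypothetical names) and assuming the NLTK stopword corpus has been downloaded:

import re

import textparser
from nltk.corpus import stopwords


def preprocess_tweet(text, remove_tags_and_mentions=True, remove_stopwords=True):
    # Lowercase, drop the 'url' placeholder, optionally drop @mentions and #hashtags.
    stripped = re.sub(r'\burl\b', '', text.lower())
    if remove_tags_and_mentions:
        stripped = re.sub(r'(\b|\s)([@#][\w_-]+)', '', stripped)
    stop = stopwords.words('english') if remove_stopwords else []
    # word_tokenize yields 1-tuples; unpack them into plain strings.
    return [token for (token,) in textparser.word_tokenize(stripped, stop)]


print(preprocess_tweet('Check this out url @someone #python'))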
Example #7
 def test_ngrams(self):
     assert list(
         textparser.word_tokenize(
             text='foo bar bomb blar',
             ngrams=2,
         )) == [('foo', 'bar'), ('bar', 'bomb'), ('bomb', 'blar')]
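Because add_term_occurrence simply stores whatever tuple word_tokenize yields, the ngrams option should combine with the indexing code further up to index bigrams instead of single words; a small sketch under that assumption:

import hashedindex
import textparser

index = hashedindex.HashedIndex()
# Each term is now a 2-tuple, so the index keys are bigrams.
for term in textparser.word_tokenize('foo bar bomb blar', ngrams=2):
    index.add_term_occurrence(term, 'doc-0')
print(list(index.terms()))  # the three bigrams from the test above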
Example #8
 def test_ignores_numeric(self):
     assert list(textparser.word_tokenize(text='one two 3 four', )) == [
         ('one', ), ('two', ), ('four', )
     ]
Example #9
 def test_min_length(self):
     assert list(
         textparser.word_tokenize(
             text='one for the money two for the go',
             min_length=4,
         )) == [('money', )]
Example #10
 def test_splits_punctuation(self):
     assert list(textparser.word_tokenize(text='first. second', )) == [
         ('first', ), ('second', )
     ]