Beispiel #1
0
def create_features(_words: list) -> list:
    """Build a count vector for the given words.

    Every word that is not a stop word is stemmed, and the counter at the
    position that the externally defined ``ids`` assigns to that stem is
    incremented.

    Args:
        _words: Tokens to count.

    Returns:
        A list of ``len(ids)`` integer counts.

    Raises:
        KeyError: If a stem is missing from ``ids`` (the lookup is unguarded,
            as it was before).
    """
    _phi = [0] * len(ids)
    for word in _words:
        if is_stopword(word):
            continue
        # Index by the stem rather than the raw word: the stem used to be
        # computed and then thrown away.
        # NOTE(review): assumes ``ids`` is keyed by stemmed forms - confirm
        # against the code that builds ``ids``.
        _phi[ids[stemmer.stem(word)]] += 1
    return _phi
Beispiel #2
0
def preprocessor(sentence: str) -> str:
    """Drop stop words and "--" tokens from *sentence* and stem the rest.

    The sentence is split on whitespace; each surviving token is passed
    through ``stem`` and the results are joined with single spaces.
    """
    def keep(token: str) -> bool:
        # The length test is retained from the original guard; tokens
        # produced by a whitespace split are never empty.
        return not (is_stopword(token) or len(token) < 1 or token == "--")

    return " ".join(stem(token) for token in sentence.split() if keep(token))
Beispiel #3
0
def get_feature(sentence):
    """Return the sentence with stop words removed and the rest stemmed.

    One-character tokens and the token '--' are discarded as well.
    """
    kept = [
        stem(token)
        for token in sentence.split()
        if not is_stopword(token) and len(token) != 1 and token != '--'
    ]
    return ' '.join(kept)
Beispiel #4
0
def get_feature_base(sentence):
    """Remove stop words from *sentence* and stem what remains.

    A new English Snowball stemmer is created on every call. Tokens come
    from a whitespace split; the stemmed tokens are joined with single
    spaces.
    """
    english = snowballstemmer.stemmer('english')
    stemmed = [
        english.stemWord(token)
        for token in sentence.split()
        if not is_stopword(token)
    ]
    return ' '.join(stemmed)
Beispiel #5
0
def feature_extraction(text: str):
    """Return *text* with stop words and one-character tokens removed, stemmed.

    Tokens are produced by splitting on single space characters only, so
    consecutive spaces yield empty tokens; those are not removed by the
    length test (their length is 0, not 1).
    """
    stems = [
        stemmer.stemWord(token)
        for token in text.split(' ')
        if not (is_stopword(token) or len(token) == 1)
    ]
    return ' '.join(stems)
Beispiel #6
0
def clean_sentence(line: str) -> str:
    """Strip trailing punctuation, drop stop words, and stem the rest.

    Args:
        line: Text whose tokens are separated by whitespace.

    Returns:
        The stems of the surviving tokens joined by single spaces.
    """
    sentence = []

    for word in line.split():
        word = word.rstrip('\n,.;:?!')
        # A token made only of the stripped characters is now empty; skip it
        # before it reaches ``is_stopword`` or ``stem``.
        if not word:
            continue
        # Test the unstemmed token: checking after stemming lets any stop
        # word whose stem differs from its surface form slip through.
        # NOTE(review): assumes is_stopword expects unstemmed words - confirm.
        if is_stopword(word):
            continue
        stemmed = stem(word)
        # Keep the original guarantee that empty results are never emitted.
        if stemmed:
            sentence.append(stemmed)

    return ' '.join(sentence)
Beispiel #7
0
def stopword_removal(word_features="word_features.txt",
                     no_stopword_features="no_stopword_features.txt",
                     total=10662):
    """Copy a feature file, leaving out stop words on every line.

    Each input line is split on single spaces into a leading label followed
    by words. The label is always written; words for which ``is_stopword``
    is true are omitted. One output line is written per input line.

    Args:
        word_features: Path of the input file (read as latin-1).
        no_stopword_features: Path of the output file (written as latin-1).
        total: Expected number of input lines; only sizes the progress bar.
    """
    pbar = tqdm(total=total)
    try:
        with open(word_features, "r", encoding="latin-1") as f, \
                open(no_stopword_features, "w", encoding="latin-1") as fw:
            for line in f:
                # split() always yields at least one item, so the label
                # unpacking cannot fail even for a blank line.
                label, *words = line.rstrip().split(" ")
                kept = [word for word in words if not is_stopword(word)]
                fw.write(" ".join([label, *kept]) + "\n")
                pbar.update(1)
    finally:
        # Close the bar even when reading, filtering or writing fails.
        pbar.close()
Beispiel #8
0
 def test_check_nlp(self):
     # 'nlp' must not be reported as a stop word.
     verdict = is_stopword('nlp')
     self.assertFalse(verdict)
Beispiel #9
0
 def test_check_the(self):
     # 'the' must be reported as a stop word.
     verdict = is_stopword('the')
     self.assertTrue(verdict)
Beispiel #10
0
 def test_check_a(self):
     # The single-letter word 'a' must be reported as a stop word.
     verdict = is_stopword('a')
     self.assertTrue(verdict)
Beispiel #11
0
 def test_check_no_args(self):
     # Calling with no arguments must raise TypeError.
     self.assertRaises(TypeError, is_stopword)
Beispiel #12
0
 def test_check_zerolength(self):
     # An empty string must be rejected with TypeError.
     self.assertRaises(TypeError, is_stopword, '')
Beispiel #13
0
 def test_check_physics(self):
     # 'physics' must not be reported as a stop word.
     verdict = is_stopword('physics')
     self.assertFalse(verdict)
Beispiel #14
0
def check(stem):
    """Return True unless *stem* is a stop word or exactly one character long."""
    return not (is_stopword(stem) or len(stem) == 1)
Beispiel #15
0
 def test_check_you(self):
     # 'you' must be reported as a stop word.
     verdict = is_stopword('you')
     self.assertTrue(verdict)
Beispiel #16
0
 def test_check_i(self):
     # Lower-case 'i' must be reported as a stop word.
     verdict = is_stopword('i')
     self.assertTrue(verdict)
Beispiel #17
0
    def test_is_stopword(self):
        # 'is' is expected to be a stop word; 'me' is expected not to be.
        for word, expected in (('is', True), ('me', False)):
            self.assertEqual(is_stopword(word), expected)