def __init__(self, directory):
    """Remember where the dataset lives and create the stemmer.

    directory: dataset directory, as a string
    stemmer:   a new ``Stemmer`` instance stored on the object
               (NOTE(review): earlier notes describe it as an NLTK
               Porter-style stemmer/tokenizer -- confirm against the
               ``Stemmer`` import)
    """
    # The two attributes are independent of each other.
    self.stemmer = Stemmer()
    self.directory = directory
def test_stemming(test_file, lexicon_file):
    """Run every stemming test case in ``test_file`` and report the results.

    test_file:    path to a YAML file holding a sequence of mappings.  Each
                  mapping has a ``lemma`` key, optional ``test_length``
                  (default ``True``) and ``location`` (default ``""``) keys,
                  and any number of remaining ``parse: form`` pairs.
    lexicon_file: passed straight to ``Stemmer``.

    Every ``parse``/``form`` pair is handed to ``stemmer.stem``; once all
    cases have run, ``stemmer.counter.results()`` is called.
    """
    stemmer = Stemmer(lexicon_file)

    with open(test_file) as f:
        # safe_load instead of bare yaml.load: calling yaml.load without a
        # Loader is rejected by PyYAML >= 6 and, on older versions, may
        # construct arbitrary Python objects.  An empty document loads as
        # None, which is treated as "no test cases".
        tests = yaml.safe_load(f) or []

    for test in tests:
        # Pop the bookkeeping keys so only parse -> form pairs remain.
        lemma = strip_length(test.pop("lemma"))
        test_length = test.pop("test_length", True)
        location = test.pop("location", "")
        for parse, form in test.items():
            stemmer.stem(location, lemma, parse, form, test_length)

    stemmer.counter.results()
class Dataset(object):
    """Reads labelled documents from a directory tree and stems their text."""

    def __init__(self, directory):
        """Store the dataset location and create the stemmer.

        directory: dataset directory as a string; ``get_set`` concatenates
                   it directly with the sub-directory name, so it normally
                   ends with a path separator
        stemmer:   a new ``Stemmer`` instance used by ``get_set``
        """
        self.directory = directory
        self.stemmer = Stemmer()

    def get_set(self, train_dir):
        """Load and stem every file found under ``directory + train_dir``.

        Only lines starting with ``<ANCH>`` or ``<P>`` are kept; each is
        passed to ``self.stemmer.stem_text`` and the results are
        concatenated into one list per file.  Files are decoded as
        ISO-8859-9.

        Returns a tuple ``(X, Y)`` of parallel lists: ``X[i]`` holds the
        stemmed data of the i-th file and ``Y[i]`` is ``0`` when the file
        name starts with ``'n'`` and ``1`` otherwise.

        The tree is walked in place: the process working directory is no
        longer changed, so repeated calls work with a relative
        ``directory``.
        """
        X = []
        Y = []
        base_dir = self.directory + train_dir
        for root, _dirs, files in os.walk(base_dir):
            for name in files:
                data = []
                # Join with ``root`` so files in nested directories open
                # correctly; ``with`` guarantees the handle is closed.
                path = os.path.join(root, name)
                with open(path, 'r', encoding='iso-8859-9') as f:
                    for line in f:
                        if line.startswith(("<ANCH>", "<P>")):
                            data += self.stemmer.stem_text(line)
                X.append(data)
                # File names beginning with 'n' are the non-relevant
                # class (0); everything else is class 1.
                Y.append(0 if name[0] == 'n' else 1)
        return X, Y
return remove_stopwords(remove_ponctuation(str(s))).upper() def toString(sentence): out = '' if str(sentence) != 'nan': for word in sentence.split(): if isinstance(word, basestring): out += (" " + word) # else: # out += (" " + str(word)) return out stemmer = Stemmer() def DistJaccard(str1, str2): if str1 != '' and str2 != '': str1 = set(str1.split()) str2 = set(str2.split()) return 1.0 - float(len(str1 & str2)) / len(str1 | str2) else: return numpy.nan #--------------------------------------------------------------------------------# # Get products infos from GoldStandard # #--------------------------------------------------------------------------------#
#!/usr/bin/env python3 from pysblgnt import morphgnt_rows from stemming import Stemmer IGNORE_LIST = [ "σαβαχθάνι", "ἔνι", "χρή", ] stemmer = Stemmer("lexicons/morphgnt.yaml") for book_num in range(1, 28): for row in morphgnt_rows(book_num): ccat_pos = row["ccat-pos"] ccat_parse = row["ccat-parse"] norm = row["norm"] lemma = row["lemma"] if ccat_pos != "V-": continue if lemma in IGNORE_LIST: continue if ccat_parse[3] == "N": parse = ccat_parse[1:4]