def apply_settings(self, x):
    """Apply a single named option to the pipeline.

    The name decides which attribute is updated:

    * ``'word'`` / ``'sent'``                   -> ``tokenizer_name``
    * ``'nltk'`` / ``'Extend'``                 -> ``stop_dict_name``
    * ``'Count'`` / ``'TfiDf'``                 -> ``vec`` and ``vec_name``
    * ``'Porter'`` / ``'SnowBall'`` / ``'ISR'`` -> ``stemmer`` and ``stemmer_name``

    Any other value leaves the object unchanged.
    """
    # Tokenizer and stop-word options only record the chosen name.
    if x == 'word' or x == 'sent':
        self.tokenizer_name = x
        return
    if x == 'nltk' or x == 'Extend':
        self.stop_dict_name = x
        return

    # Vectorizer and stemmer options build a fresh component as well.
    if x == 'Count':
        self.vec, self.vec_name = CountVectorizer(), x
    elif x == 'TfiDf':
        self.vec, self.vec_name = TfidfVectorizer(), x
    elif x == 'Porter':
        self.stemmer, self.stemmer_name = PorterStemmer(), x
    elif x == 'SnowBall':
        self.stemmer, self.stemmer_name = SnowballStemmer(
            language='english', ignore_stopwords=True), x
    elif x == 'ISR':
        self.stemmer, self.stemmer_name = ISRIStemmer(), x
def stemming(self, text):
    """Stem each whitespace-separated word of ``text`` with the ISRI stemmer.

    A new ``ISRIStemmer`` is kept on ``self.ps`` and the list of stems on
    ``self.stemmed_words``; the stems are returned joined by single spaces.
    """
    self.ps = ISRIStemmer()
    stem_one = self.ps.stem
    self.stemmed_words = [stem_one(token) for token in text.split()]
    return " ".join(self.stemmed_words)
def ArabicStemming(text):
    """Return the ISRI stem of every word in ``text``.

    :param text: an iterable of word tokens, or a plain string. A string is
        split on whitespace first, because iterating it directly would hand
        the stemmer one character at a time.
    :return: list of stems, in input order
    """
    st = ISRIStemmer()
    words = text.split() if isinstance(text, str) else text
    return [st.stem(w) for w in words]
def stem(self, txt):
    """Tokenize ``txt`` with ``self.tokenize`` and return the ISRI stem of each token."""
    isri = ISRIStemmer()
    return [isri.stem(token) for token in self.tokenize(txt)]
def request_tokenizing(req_text_path, save_path='../data/user_requests'):
    """
    Read a request, tokenize & stem it and save the info in a structured xml file.

    Only the first line of the request file is read. Every token becomes a
    ``<word>`` element carrying its 1-based ``id``, its ``value``, whether it
    is an Arabic stop word (``stop_word``) and its ISRI ``stem``.

    :param req_text_path: the path to the request .txt file
    :param save_path: the directory to save the structured .xml output in
    :return: the path of the written .xml file
    """
    # The context manager closes the file even if reading fails.
    with open(req_text_path, encoding='utf8') as f:
        line = f.readline()

    # reading stop words (a set gives constant-time membership tests per token)
    stop_words = set(stopwords.words('arabic'))

    # deleting punctuation THIS IS A MATTER OF DISCUSSION
    line = re.sub(r'[^\w\s]', '', line)

    # NLP
    tokenizer = WordPunctTokenizer()
    stemmer = ISRIStemmer()
    tokens = tokenizer.tokenize(line)

    root = ET.Element('root')
    tok_elem = ET.SubElement(root, 'tokenization')
    for i, t in enumerate(tokens, 1):
        ET.SubElement(tok_elem,
                      'word',
                      id=str(i),
                      value=str(t),
                      stop_word=str(str(t) in stop_words),
                      stem=stemmer.stem(t))

    # Output name: the request file's base name up to its first dot, plus ".xml".
    file_str = save_path + '/' + req_text_path.split('/')[-1].split(
        '.')[0] + '.xml'

    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
    with open(file_str, 'w', encoding='utf-8') as f:
        f.write(xmlstr)

    return file_str
def __init__(self):
    """Create the default components, their display names and the stop-word sets."""
    # Default components.
    english_stops = stopwords.words('english')
    self.stop_dict = set(english_stops) | set(punctuation)
    self.vec = CountVectorizer()
    self.stemmer = PorterStemmer()
    self.lemmat = WordNetLemmatizer()

    # Names reported for the components above.
    self.stop_dict_name = 'nltk'
    self.vec_name = 'Count Vectorizer'
    self.stemmer_name = 'ProterStemmer'
    self.tokenizer_name = 'word'

    # Extended stop-word list used when the 'Extend' stop dictionary is selected.
    web_words = [
        "a", "a's", "able", "about", "above", "according", "accordingly",
        "across", "actually", "after", "afterwards", "again", "against",
        "ain't", "all", "allow", "allows", "almost", "alone", "along",
        "already", "also", "although", "always", "am", "among", "amongst",
        "an", "and", "another", "any", "anybody", "anyhow", "anyone",
        "anything", "anyway", "anyways", "anywhere", "apart", "appear",
        "appreciate", "appropriate", "are", "aren't", "around", "as",
        "aside", "ask", "asking", "associated", "at", "available", "away",
        "awfully", "b", "be", "became", "because", "become", "becomes",
        "becoming", "been", "before", "beforehand", "behind", "being",
        "believe", "below", "beside", "besides", "best", "better",
        "between", "beyond", "both", "brief", "but", "by", "c", "c'mon",
        "c's", "came", "can", "can't", "cannot", "cant", "cause", "causes",
        "certain", "certainly", "changes", "clearly", "co", "com", "come",
        "comes", "concerning", "consequently", "consider", "considering",
        "contain", "containing", "contains", "corresponding", "could",
        "couldn't", "course", "currently", "d", "definitely", "described",
        "despite", "did", "didn't", "different", "do", "does", "doesn't",
        "doing", "don't", "done", "down", "downwards", "during", "e",
        "each", "edu", "eg", "eight", "either", "else", "elsewhere",
        "enough", "entirely", "especially", "et", "etc", "even", "ever",
        "every", "everybody", "everyone", "everything", "everywhere", "ex",
        "exactly", "example", "except", "f", "far", "few", "fifth",
        "first", "five", "followed", "following", "follows", "for",
        "former", "formerly", "forth", "four", "from", "further",
        "furthermore", "g", "get", "gets", "getting", "given", "gives",
        "go", "goes", "going", "gone", "got", "gotten", "greetings", "h",
        "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
        "haven't", "having", "he", "he's", "hello", "help", "hence", "her",
        "here", "here's", "hereafter", "hereby", "herein", "hereupon",
        "hers", "herself", "hi", "him", "himself", "his", "hither",
        "hopefully", "how", "howbeit", "however", "i", "i'd", "i'll",
        "i'm", "i've", "ie", "if", "ignored", "immediate", "in",
        "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates",
        "inner", "insofar", "instead", "into", "inward", "is", "isn't",
        "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k",
        "keep", "keeps", "kept", "know", "known", "knows", "l", "last",
        "lately", "later", "latter", "latterly", "least", "less", "lest",
        "let", "let's", "like", "liked", "likely", "little", "look",
        "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe",
        "me", "mean", "meanwhile", "merely", "might", "more", "moreover",
        "most", "mostly", "much", "must", "my", "myself", "n", "name",
        "namely", "nd", "near", "nearly", "necessary", "need", "needs",
        "neither", "never", "nevertheless", "new", "next", "nine", "no",
        "nobody", "non", "none", "noone", "nor", "normally", "not",
        "nothing", "novel", "now", "nowhere", "o", "obviously", "of",
        "off", "often", "oh", "ok", "okay", "old", "on", "once", "one",
        "ones", "only", "onto", "or", "other", "others", "otherwise",
        "ought", "our", "ours", "ourselves", "out", "outside", "over",
        "overall", "own", "p", "particular", "particularly", "per",
        "perhaps", "placed", "please", "plus", "possible", "presumably",
        "probably", "provides", "q", "que", "quite", "qv", "r", "rather",
        "rd", "re", "really", "reasonably", "regarding", "regardless",
        "regards", "relatively", "respectively", "right", "'s", "said",
        "same", "saw", "say", "saying", "says", "second", "secondly",
        "see", "seeing", "seem", "seemed", "seeming", "seems", "seen",
        "self", "selves", "sensible", "sent", "serious", "seriously",
        "seven", "several", "shall", "she", "should", "shouldn't", "since",
        "six", "so", "some", "somebody", "somehow", "someone", "something",
        "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
        "specified", "specify", "specifying", "still", "sub", "such",
        "sup", "sure", "t", "t's", "take", "taken", "tell", "tends", "th",
        "than", "thank", "thanks", "thanx", "that", "that's", "thats",
        "the", "their", "theirs", "them", "themselves", "then", "thence",
        "there", "there's", "thereafter", "thereby", "therefore",
        "therein", "theres", "thereupon", "these", "they", "they'd",
        "they'll", "they're", "they've", "think", "third", "this",
        "thorough", "thoroughly", "those", "though", "three", "through",
        "throughout", "thru", "thus", "to", "together", "too", "took",
        "toward", "towards", "tried", "tries", "truly", "try", "trying",
        "twice", "two", "u", "un", "under", "unfortunately", "unless",
        "unlikely", "until", "unto", "up", "upon", "us", "use", "used",
        "useful", "uses", "using", "usually", "uucp", "v", "value",
        "various", "very", "via", "viz", "vs", "w", "want", "wants", "was",
        "wasn't", "way", "we", "we'd", "we'll", "we're", "we've",
        "welcome", "well", "went", "were", "weren't", "what", "what's",
        "whatever", "when", "whence", "whenever", "where", "where's",
        "whereafter", "whereas", "whereby", "wherein", "whereupon",
        "wherever", "whether", "which", "while", "whither", "who", "who's",
        "whoever", "whole", "whom", "whose", "why", "will", "willing",
        "wish", "with", "within", "without", "won't", "wonder", "would",
        "wouldn't", "x", "y", "yes", "yet", "you", "you'd", "you'll",
        "you're", "you've", "your", "yours", "yourself", "yourselves", "z",
        "zero", "html", "ol",
    ]
    self.web_stop_words = web_words
    # Union of the default stop set and the extended list.
    self.stop_web_punct = self.stop_dict | set(web_words)
class nlp():
    """Configurable text-cleaning pipeline.

    Holds a tokenizer choice, a stop-word set, a vectorizer and a stemmer.
    Each can be switched by name through :meth:`apply_settings`; the other
    methods apply the currently selected components to a piece of text.
    """

    # Symbols, digits and empty bracket pairs that are replaced by a space
    # before tokenizing. Raw string: the previous plain literal relied on
    # invalid escape sequences such as "\-" and "\[".
    _SYMBOLS_RE = re.compile(
        r'[!@#$%^&*()\n_:><?\-.{}|+-,;""``~`—]|[0-9]|/|=|\[\]|\[\[\]\]')
    # Quote characters that are removed outright.
    _QUOTES_RE = re.compile("[“’']")

    def __init__(self):
        """Create the default components, their names and the stop-word sets."""
        # initialise default words
        self.stop_dict = set(stopwords.words('english')).union(punctuation)
        self.vec = CountVectorizer()
        self.stemmer = PorterStemmer()
        self.lemmat = WordNetLemmatizer()
        self.stop_dict_name = 'nltk'
        # NOTE(review): the two default names below are not among the values
        # apply_settings() assigns ('Count', 'Porter', ...), so nlp_cleaner()
        # performs no stemming until a stemmer is selected explicitly.
        # Confirm whether that is intended before changing them.
        self.vec_name = 'Count Vectorizer'
        self.stemmer_name = 'ProterStemmer'
        self.tokenizer_name = 'word'
        self.web_stop_words = [
            "a", "a's", "able", "about", "above", "according", "accordingly",
            "across", "actually", "after", "afterwards", "again", "against",
            "ain't", "all", "allow", "allows", "almost", "alone", "along",
            "already", "also", "although", "always", "am", "among", "amongst",
            "an", "and", "another", "any", "anybody", "anyhow", "anyone",
            "anything", "anyway", "anyways", "anywhere", "apart", "appear",
            "appreciate", "appropriate", "are", "aren't", "around", "as",
            "aside", "ask", "asking", "associated", "at", "available", "away",
            "awfully", "b", "be", "became", "because", "become", "becomes",
            "becoming", "been", "before", "beforehand", "behind", "being",
            "believe", "below", "beside", "besides", "best", "better",
            "between", "beyond", "both", "brief", "but", "by", "c", "c'mon",
            "c's", "came", "can", "can't", "cannot", "cant", "cause",
            "causes", "certain", "certainly", "changes", "clearly", "co",
            "com", "come", "comes", "concerning", "consequently", "consider",
            "considering", "contain", "containing", "contains",
            "corresponding", "could", "couldn't", "course", "currently", "d",
            "definitely", "described", "despite", "did", "didn't",
            "different", "do", "does", "doesn't", "doing", "don't", "done",
            "down", "downwards", "during", "e", "each", "edu", "eg", "eight",
            "either", "else", "elsewhere", "enough", "entirely", "especially",
            "et", "etc", "even", "ever", "every", "everybody", "everyone",
            "everything", "everywhere", "ex", "exactly", "example", "except",
            "f", "far", "few", "fifth", "first", "five", "followed",
            "following", "follows", "for", "former", "formerly", "forth",
            "four", "from", "further", "furthermore", "g", "get", "gets",
            "getting", "given", "gives", "go", "goes", "going", "gone", "got",
            "gotten", "greetings", "h", "had", "hadn't", "happens", "hardly",
            "has", "hasn't", "have", "haven't", "having", "he", "he's",
            "hello", "help", "hence", "her", "here", "here's", "hereafter",
            "hereby", "herein", "hereupon", "hers", "herself", "hi", "him",
            "himself", "his", "hither", "hopefully", "how", "howbeit",
            "however", "i", "i'd", "i'll", "i'm", "i've", "ie", "if",
            "ignored", "immediate", "in", "inasmuch", "inc", "indeed",
            "indicate", "indicated", "indicates", "inner", "insofar",
            "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll",
            "it's", "its", "itself", "j", "just", "k", "keep", "keeps",
            "kept", "know", "known", "knows", "l", "last", "lately", "later",
            "latter", "latterly", "least", "less", "lest", "let", "let's",
            "like", "liked", "likely", "little", "look", "looking", "looks",
            "ltd", "m", "mainly", "many", "may", "maybe", "me", "mean",
            "meanwhile", "merely", "might", "more", "moreover", "most",
            "mostly", "much", "must", "my", "myself", "n", "name", "namely",
            "nd", "near", "nearly", "necessary", "need", "needs", "neither",
            "never", "nevertheless", "new", "next", "nine", "no", "nobody",
            "non", "none", "noone", "nor", "normally", "not", "nothing",
            "novel", "now", "nowhere", "o", "obviously", "of", "off", "often",
            "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only",
            "onto", "or", "other", "others", "otherwise", "ought", "our",
            "ours", "ourselves", "out", "outside", "over", "overall", "own",
            "p", "particular", "particularly", "per", "perhaps", "placed",
            "please", "plus", "possible", "presumably", "probably",
            "provides", "q", "que", "quite", "qv", "r", "rather", "rd", "re",
            "really", "reasonably", "regarding", "regardless", "regards",
            "relatively", "respectively", "right", "'s", "said", "same",
            "saw", "say", "saying", "says", "second", "secondly", "see",
            "seeing", "seem", "seemed", "seeming", "seems", "seen", "self",
            "selves", "sensible", "sent", "serious", "seriously", "seven",
            "several", "shall", "she", "should", "shouldn't", "since", "six",
            "so", "some", "somebody", "somehow", "someone", "something",
            "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
            "specified", "specify", "specifying", "still", "sub", "such",
            "sup", "sure", "t", "t's", "take", "taken", "tell", "tends", "th",
            "than", "thank", "thanks", "thanx", "that", "that's", "thats",
            "the", "their", "theirs", "them", "themselves", "then", "thence",
            "there", "there's", "thereafter", "thereby", "therefore",
            "therein", "theres", "thereupon", "these", "they", "they'd",
            "they'll", "they're", "they've", "think", "third", "this",
            "thorough", "thoroughly", "those", "though", "three", "through",
            "throughout", "thru", "thus", "to", "together", "too", "took",
            "toward", "towards", "tried", "tries", "truly", "try", "trying",
            "twice", "two", "u", "un", "under", "unfortunately", "unless",
            "unlikely", "until", "unto", "up", "upon", "us", "use", "used",
            "useful", "uses", "using", "usually", "uucp", "v", "value",
            "various", "very", "via", "viz", "vs", "w", "want", "wants",
            "was", "wasn't", "way", "we", "we'd", "we'll", "we're", "we've",
            "welcome", "well", "went", "were", "weren't", "what", "what's",
            "whatever", "when", "whence", "whenever", "where", "where's",
            "whereafter", "whereas", "whereby", "wherein", "whereupon",
            "wherever", "whether", "which", "while", "whither", "who",
            "who's", "whoever", "whole", "whom", "whose", "why", "will",
            "willing", "wish", "with", "within", "without", "won't", "wonder",
            "would", "wouldn't", "x", "y", "yes", "yet", "you", "you'd",
            "you'll", "you're", "you've", "your", "yours", "yourself",
            "yourselves", "z", "zero", "html", "ol"
        ]
        self.stop_web_punct = self.stop_dict.union(self.web_stop_words)

    def get_settings(self):
        """Return ``[tokenizer, stop dictionary, vectorizer, stemmer]`` names."""
        return [
            self.tokenizer_name, self.stop_dict_name, self.vec_name,
            self.stemmer_name
        ]

    def apply_settings(self, x):
        """Apply one named option; names that match no option are ignored.

        ``'word'``/``'sent'`` select the tokenizer, ``'nltk'``/``'Extend'``
        the stop dictionary, ``'Count'``/``'TfiDf'`` build a new vectorizer,
        and ``'Porter'``/``'SnowBall'``/``'ISR'`` build a new stemmer.
        """
        if x == 'word':
            self.tokenizer_name = x
        elif x == 'sent':
            self.tokenizer_name = x
        elif x == 'nltk':
            self.stop_dict_name = x
        elif x == 'Extend':
            self.stop_dict_name = x
        elif x == 'Count':
            self.vec = CountVectorizer()
            self.vec_name = x
        elif x == 'TfiDf':
            self.vec = TfidfVectorizer()
            self.vec_name = x
        elif x == 'Porter':
            self.stemmer = PorterStemmer()
            self.stemmer_name = x
        elif x == 'SnowBall':
            self.stemmer = SnowballStemmer(language='english',
                                           ignore_stopwords=True)
            self.stemmer_name = x
        elif x == 'ISR':
            self.stemmer = ISRIStemmer()
            self.stemmer_name = x

    def get_tokens(self, input):
        """Return ``str()`` of the token list produced by the selected tokenizer.

        Prints an error and returns ``None`` when the tokenizer name is unknown.
        """
        if self.tokenizer_name == 'word':
            return str(word_tokenize(input))
        elif self.tokenizer_name == 'sent':
            return str(sent_tokenize(input))
        else:
            print('error invaid tokenizer type')

    def get_stemmer(self, input):
        """Stem every token of ``input``; each stem is followed by one space."""
        if self.tokenizer_name == 'word':
            tokens = word_tokenize(input)
        elif self.tokenizer_name == 'sent':
            tokens = sent_tokenize(input)
        else:
            tokens = []
        return ''.join(self.stemmer.stem(tok) + ' ' for tok in tokens)

    def valid_char(self, input):
        """Return True when every character has a code point in 33..126."""
        return all(33 <= ord(ch) <= 126 for ch in input)

    def _strip_symbols(self, text):
        """Replace symbols and digits with spaces, then delete quote characters."""
        return self._QUOTES_RE.sub('', self._SYMBOLS_RE.sub(' ', text))

    def _active_stop_set(self):
        """Return the stop set chosen by ``stop_dict_name`` (None if unknown)."""
        if self.stop_dict_name == 'nltk':
            return self.stop_dict
        if self.stop_dict_name == 'Extend':
            return self.stop_web_punct
        return None

    def _keep_token(self, token, stop_set):
        """Tell whether ``token`` survives stop-word, number, length and charset checks."""
        return (token not in stop_set
                and not token.split('.')[-1].isdigit()
                and not token.split(',')[-1].isdigit()
                and len(token) > 1
                and self.valid_char(token))

    def get_stopwords(self, input):
        """Return ``str()`` of the word tokens of ``input`` that pass the stop filter.

        Returns ``None`` when the stop dictionary name is unknown.
        """
        stop_set = self._active_stop_set()
        if stop_set is None:
            return None
        input = self._strip_symbols(input)
        return str([
            tok for tok in word_tokenize(input)
            if self._keep_token(tok, stop_set)
        ])

    def get_vec(self, input):
        """Fit the current vectorizer on ``input`` alone and return ``str()`` of the array."""
        return str(self.vec.fit_transform([input]).toarray())

    def penn2morphy(self, penntag):
        """ Converts Penn Treebank tags to WordNet.

        Tags that are unknown, or that cannot be sliced or hashed, map to
        ``'n'``.
        """
        morphy_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
        try:
            return morphy_tag[penntag[:2]]
        except (KeyError, TypeError):
            return 'n'

    def lemmatize_sent(self, text):
        """Return ``str()`` of the lower-cased, POS-aware WordNet lemmas of ``text``."""
        wnl = WordNetLemmatizer()
        # Text input is string, returns lowercased strings.
        return str([
            wnl.lemmatize(word.lower(), pos=self.penn2morphy(tag))
            for word, tag in pos_tag(word_tokenize(text))
        ])

    def nlp_cleaner(self, x, inf=0):
        """Run tokenizing, stop-word filtering and stemming on ``x``.

        :param x: text to clean; anything that is not a ``str`` yields an
            error message string instead of a token list
        :param inf: when truthy, print the first ten items after each stage
        :return: list of cleaned tokens
        """
        if not isinstance(x, str):
            return 'invalid input , input must be string'

        #1 word tokenization , lowercasing
        x = self._strip_symbols(x)
        if self.tokenizer_name == 'word':
            x = [tok.lower() for tok in word_tokenize(x)]
        elif self.tokenizer_name == 'sent':
            x = [tok.lower() for tok in sent_tokenize(x)]
        if inf:
            print(self.tokenizer_name)
            print('tokenizer')
            print(x[0:10])

        #2 stop words , removing punctuations
        stop_set = self._active_stop_set()
        if stop_set is not None:
            x = [tok for tok in x if self._keep_token(tok, stop_set)]
        if inf:
            print('StopWords')
            print(x[0:10])

        #3 Stemming and Lemmatization
        if self.stemmer_name in ('Porter', 'SnowBall', 'ISR'):
            x = [self.stemmer.stem(tok) for tok in x]
        if inf:
            print('after stemming')
            print(x[0:10])
        return x

    def create_vec(self, name, col='STORY', encoding='UTF-8'):
        """Fit the vectorizer on one column of a data file and pickle the results.

        The transformed matrix is written to ``vec_metrix_.txt`` and the
        fitted vectorizer to ``<vec_name>.txt`` in the working directory.

        :param name: path to a ``.csv`` or ``.xlsx`` file
        :param col: column holding the documents
        :param encoding: text encoding, used for ``.csv`` input only
        :raises ValueError: if ``name`` has any other extension
        """
        ext = name.split('.')[-1]
        if ext == 'csv':
            data = pd.read_csv(name, encoding=encoding)
        elif ext == 'xlsx':
            # read_excel takes no text-encoding argument in current pandas.
            data = pd.read_excel(io=name)
        else:
            raise ValueError(
                'unsupported file type: %r (expected .csv or .xlsx)' % name)
        data = self.vec.fit_transform(data[col])
        with open('vec_metrix_.txt', 'wb') as out:
            pickle.dump(data, out)
        with open(self.vec_name + '.txt', 'wb') as out:
            pickle.dump(self.vec, out)
from keras.utils import np_utils
import tensorflow as tf
from keras import backend as K

BATCH_SIZE = 16  # Batch size for GPU
NUM_WORDS = 10000  # Vocab length
MAX_LEN = 20  # Padding length (# of words)
LSTM_EMBED = 8  # Number of LSTM nodes

K.set_learning_phase(False)

data = pd.read_csv('../dataset/ASKFM-master/full_dataset.csv')

# Load the fitted tokenizer; the context manager closes the file afterwards.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open("../models/lstm-autoencoder-tokenizer.pickle", "rb") as tokenizer_file:
    tokenizer = cPickle.load(tokenizer_file)

stemmer = ISRIStemmer()

# Read the encoder model
model = tf.keras.models.load_model('../models/lstm25/lstm-encoder.h5',
                                   compile=False)
model.load_weights('../models/lstm_encoder_weights.h5')
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Create the encoding function: it maps [model input, learning-phase flag]
# to the output of the model's second layer.
encode = K.function([model.input, K.learning_phase()],
                    [model.layers[1].output])

Questions = tokenizer.texts_to_sequences(data.Question)
# We pad sequences that are shorter than MAX_LEN
from pyparsing import StringEnd, oneOf, FollowedBy, Optional, ZeroOrMore, SkipTo file = open( "C:\\Users\Administrator\\Desktop\\myfolder\\corpora\\stats\\ielts-7to11-some.txt" ) raw = file.read() try: wordlist = nltk.word_tokenize(raw) lemmatizer = WordNetLemmatizer() print lemmatizer.lemmatize("ran") lanster = LancasterStemmer() porter = PorterStemmer() snowball = SnowballStemmer("english") isri = ISRIStemmer() rslp = RSLPStemmer() porter2 = Stemmer('english') endOfString = StringEnd() prefix = oneOf( "uni inter intro de con com anti pre pro per an ab ad af ac at as re in im ex en em un dis over sub syn out thermo philo geo for fore back" ) suffix = oneOf("ish") #suffix = oneOf("or er ed ish ian ary ation tion al ing ible able ate ly ment ism ous ness ent ic ive " # "ative tude ence ance ise ant age cide ium ion") word = (Optional(prefix)("prefixes") + SkipTo(suffix | suffix + FollowedBy(endOfString) | endOfString)("root") + ZeroOrMore(suffix | suffix + FollowedBy(endOfString))("suffix"))
def batches_generator(train_data, batch_size=32):
    """Yield random one-hot encoded batches forever, as ``(X, X)`` pairs.

    :param train_data: 2-D array of integer word ids, one row per sequence;
        the second axis is the number of timesteps
    :param batch_size: number of rows drawn for every batch
    :return: generator of ``(X, X)`` tuples where ``X`` has shape
        ``(batch_size, timesteps, num_words)``; input and target are the
        same array
    """
    # For OHE inputs
    num_words = np.max(train_data) + 1
    timesteps = train_data.shape[1]
    while True:
        # Row indices are drawn independently for every batch.
        indices = np.random.choice(len(train_data), size=batch_size)
        X = np_utils.to_categorical(train_data[indices], num_words)
        X = X.reshape((batch_size, timesteps, num_words))
        yield (X, X)


train_data = pd.read_csv(
    "/home/omar/DataScience/DataSets/askfm/full_dataset.csv")
stemmer = ISRIStemmer()

# We don't need the answers, so let's drop them
train_data.drop('Answer', inplace=True, axis=1)

# Keep only questions with fewer than MAX_LEN words
train_data = train_data[
    train_data.Question.apply(lambda x: len(x.split())) < MAX_LEN]

# Remove every character outside U+0620..U+FEF0 that is not whitespace.
# Raw string: "\s" is an invalid escape sequence in a plain string literal.
train_data.Question = train_data.Question.apply(
    lambda x: (re.sub(r'[^\u0620-\uFEF0\s]', '', x)).strip())

# Drop questions that became empty
train_data = train_data[train_data.Question.apply(len) > 0]

# Stem the words
train_data.Question = train_data.Question.apply(
    lambda x: " ".join([stemmer.stem(i) for i in x.split()]))
class Model:
    """Sentiment classifier for text stored in a CSV file.

    Each text is cleaned, stripped of Arabic stop words, ISRI-stemmed and
    TF-IDF vectorised before being handed to a scikit-learn classifier.
    """

    # Classifier names accepted by train().
    _CLASSIFIER_NAMES = ("logistic", "SVC", "KNN")

    def __init__(self):
        # NOTE(review): sc, sex_enc and imputer are not used by any method
        # of this class.
        self.sc = StandardScaler()
        self.sex_enc = LabelEncoder()
        self.imputer = Imputer()
        self.classifier = LogisticRegression()

    def cleaner(self, text):
        """Lower-case ``text`` and remove @mentions, ``@``, ``#`` and the ``:)`` / ``:(`` smileys."""
        text = text.lower()
        text = re.sub(r"@[^\s]+", "", text)
        for fragment in (":)", "@", "#", ":("):
            text = text.replace(fragment, "")
        return text

    def remove_stop_words(self, text):
        """Return ``text`` without its Arabic stop words, joined by single spaces."""
        # Load the stop words once per instance instead of once per call;
        # the set makes each membership test constant-time.
        if getattr(self, "_sw_lookup", None) is None:
            self.sw = stopwords.words("arabic")
            self._sw_lookup = frozenset(self.sw)
        self.clean_words = [
            word for word in text.split() if word not in self._sw_lookup
        ]
        return " ".join(self.clean_words)

    def stemming(self, text):
        """Return ``text`` with every word replaced by its ISRI stem."""
        # Reuse one stemmer per instance instead of building one per call.
        if getattr(self, "ps", None) is None:
            self.ps = ISRIStemmer()
        self.stemmed_words = [self.ps.stem(word) for word in text.split()]
        return " ".join(self.stemmed_words)

    def run(self, text):
        """Apply cleaning, stop-word removal and stemming, in that order."""
        text = self.cleaner(text)
        text = self.remove_stop_words(text)
        text = self.stemming(text)
        return text

    def read_df(self, path):
        """Load the CSV at ``path`` into ``self.df``."""
        self.df = pd.read_csv(path)

    def preprocessing(self):
        """Run the text pipeline over the ``txt`` column in place."""
        self.df['txt'] = self.df['txt'].apply(self.run)

    def split_df(self):
        """Build TF-IDF features ``self.x`` and labels ``self.y`` from ``self.df``."""
        self.tfidf = TfidfVectorizer()
        self.x = self.tfidf.fit_transform(self.df["txt"]).toarray()
        self.y = self.df['sentiment'].values

    def train_test(self, test_size):
        """Split features and labels with a fixed random state."""
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=test_size, random_state=0)

    def train(self, classy):
        """Train the requested classifier on ``ASTD.csv`` and report on the test split.

        :param classy: ``"logistic"``, ``"SVC"`` or ``"KNN"``
        :return: the ``classification_report`` text for the 25% test split
        :raises ValueError: if ``classy`` is not one of the names above
        """
        # Validate first so a bad name fails before any data is loaded.
        if classy not in self._CLASSIFIER_NAMES:
            raise ValueError("unknown classifier %r, expected one of %s" %
                             (classy, ", ".join(self._CLASSIFIER_NAMES)))

        self.read_df("ASTD.csv")
        self.preprocessing()
        self.split_df()
        self.train_test(0.25)

        # Always start from a fresh estimator, so an earlier train("SVC")
        # cannot leak into a later train("logistic").
        if classy == "logistic":
            self.classifier = LogisticRegression()
        elif classy == "SVC":
            self.classifier = SVC()
        else:
            self.classifier = KNeighborsClassifier()
        self.classifier.fit(self.x_train, self.y_train)

        self.y_pred = self.classifier.predict(self.x_test)
        return classification_report(self.y_test, self.y_pred)

    def evaluate(self):
        """Return the classifier's score on the held-out split."""
        return self.classifier.score(self.x_test, self.y_test)

    def predict(self, test):
        """Preprocess one raw text and return the classifier's prediction for it."""
        test = self.run(test)
        test = self.tfidf.transform([test]).toarray()
        return self.classifier.predict(test)
class Preprocess(object):
    """Configurable preprocessing pipeline for ``<doc id> <text>`` files.

    :meth:`run` loads the file and then applies, depending on the
    configuration, word segmentation, document length filtering, ISRI
    stemming, lower-casing, word filtering and finally word indexing.
    """

    _valid_lang = ['en', 'cn', 'ar']
    # Shared stemmer, built on first use by _get_stemmer() so that defining
    # the class does no work.
    _stemmer = None

    def __init__(self,
                 word_seg_config=None,
                 doc_filter_config=None,
                 word_stem_config=None,
                 word_lower_config=None,
                 word_filter_config=None,
                 word_index_config=None):
        """Merge the given per-stage overrides into the default configuration.

        Every argument is an optional dict whose keys override the defaults
        of the matching stage; ``None`` (or ``{}``) keeps the defaults.
        """
        # set default configuration
        self._word_seg_config = {'enable': True, 'lang': 'ar'}
        self._doc_filter_config = {
            'enable': True,
            'min_len': 0,
            'max_len': six.MAXSIZE
        }
        self._word_stem_config = {'enable': True}
        # NOTE(review): [False] is a non-empty list and therefore truthy, so
        # lower-casing runs by default. Kept unchanged so existing output
        # does not shift - confirm the intended default.
        self._word_lower_config = {'enable': [False]}
        self._word_filter_config = {
            'enable': False,
            'stop_words': stopwords.words('arabic'),
            'min_freq': 1,
            'max_freq': six.MAXSIZE,
            'words_useless': None
        }
        self._word_index_config = {'word_dict': None}

        self._word_seg_config.update(word_seg_config or {})
        self._doc_filter_config.update(doc_filter_config or {})
        self._word_stem_config.update(word_stem_config or {})
        self._word_lower_config.update(word_lower_config or {})
        self._word_filter_config.update(word_filter_config or {})
        self._word_index_config.update(word_index_config or {})

        self._word_dict = self._word_index_config['word_dict']
        self._words_stats = dict()
        # Defined up front so save_words_useless() works even when the word
        # filter stage never ran.
        self._words_useless = set()

    def run(self, file_path):
        """Run all enabled stages on ``file_path``.

        :return: ``(dids, docs)`` - document ids and, per document, the list
            of word indices
        """
        print('load...')
        dids, docs = Preprocess.load(file_path)

        if self._word_seg_config['enable']:
            print('word_seg...')
            docs = Preprocess.word_seg(docs, self._word_seg_config)

        if self._doc_filter_config['enable']:
            print('doc_filter...')
            dids, docs = Preprocess.doc_filter(dids, docs,
                                               self._doc_filter_config)

        if self._word_stem_config['enable']:
            print('word_stem...')
            docs = Preprocess.word_stem(docs)

        if self._word_lower_config['enable']:
            print('word_lower...')
            docs = Preprocess.word_lower(docs)

        self._words_stats = Preprocess.cal_words_stat(docs)

        if self._word_filter_config['enable']:
            print('word_filter...')
            docs, self._words_useless = Preprocess.word_filter(
                docs, self._word_filter_config, self._words_stats)

        print('word_index...')
        docs, self._word_dict = Preprocess.word_index(docs,
                                                      self._word_index_config)

        return dids, docs

    @classmethod
    def _get_stemmer(cls):
        """Return the shared ISRI stemmer, creating it on first use."""
        if Preprocess._stemmer is None:
            Preprocess._stemmer = ISRIStemmer()
        return Preprocess._stemmer

    @staticmethod
    def parse(line):
        """Split ``line`` at its first space into ``(doc id, text)``; text may be ''."""
        did, _, doc = line.partition(' ')
        return did, doc

    @staticmethod
    def load(file_path):
        """Read a UTF-8 file of ``<doc id> <text>`` lines, skipping blank lines."""
        dids = list()
        docs = list()
        with codecs.open(file_path, 'r', encoding='utf8') as f:
            for line in tqdm(f):
                line = line.strip()
                if '' != line:
                    did, doc = Preprocess.parse(line)
                    dids.append(did)
                    docs.append(doc)
        return dids, docs

    @staticmethod
    def word_seg_ar(docs):
        """Tokenize Arabic documents with ``wordpunct_tokenize``."""
        # show the progress of word segmentation with tqdm
        return [wordpunct_tokenize(sent) for sent in tqdm(docs)]

    @staticmethod
    def word_seg_en(docs):
        """Tokenize English documents with ``word_tokenize``."""
        # show the progress of word segmentation with tqdm
        return [word_tokenize(sent) for sent in tqdm(docs)]

    @staticmethod
    def word_seg_cn(docs):
        """Tokenize Chinese documents with ``jieba.cut``."""
        return [list(jieba.cut(sent)) for sent in docs]

    @staticmethod
    def word_seg(docs, config):
        """Dispatch to ``word_seg_<lang>`` for the language in ``config['lang']``.

        The language code is matched case-insensitively.
        """
        lang = config['lang'].lower()
        assert lang in Preprocess._valid_lang, \
            'Wrong language type: %s' % config['lang']
        # Dispatch on the normalised code; the raw value may differ in case.
        return getattr(Preprocess, 'word_seg_%s' % lang)(docs)

    @staticmethod
    def cal_words_stat(docs):
        """Compute per-word statistics over tokenized documents.

        :return: dict ``word -> {'cf', 'df', 'idf'}`` with the collection
            frequency, the document frequency and
            ``log((1 + number of docs) / (1 + df))``
        """
        words_stats = {}
        docs_num = len(docs)
        for ws in docs:
            for w in ws:
                if w not in words_stats:
                    words_stats[w] = {'cf': 0, 'df': 0, 'idf': 0}
                words_stats[w]['cf'] += 1
            # Count each word once per document.
            for w in set(ws):
                words_stats[w]['df'] += 1
        for winfo in words_stats.values():
            winfo['idf'] = np.log((1. + docs_num) / (1. + winfo['df']))
        return words_stats

    @staticmethod
    def word_filter(docs, config, words_stats):
        """Remove stop words and words whose document frequency is out of range.

        ``config['words_useless']`` is extended in place and returned
        together with the filtered documents.
        """
        if config['words_useless'] is None:
            config['words_useless'] = set()
            # filter with stop_words
            config['words_useless'].update(config['stop_words'])
            # filter with min_freq and max_freq
            for w, winfo in words_stats.items():
                # filter too frequent words or rare words
                if config['min_freq'] > winfo['df'] or config[
                        'max_freq'] < winfo['df']:
                    config['words_useless'].add(w)
        # filter with useless words
        docs = [[w for w in ws if w not in config['words_useless']]
                for ws in tqdm(docs)]
        return docs, config['words_useless']

    @staticmethod
    def doc_filter(dids, docs, config):
        """Keep documents whose length lies in ``[min_len, max_len]``."""
        new_docs = list()
        new_dids = list()
        for i in tqdm(range(len(docs))):
            if config['min_len'] <= len(docs[i]) <= config['max_len']:
                new_docs.append(docs[i])
                new_dids.append(dids[i])
        return new_dids, new_docs

    @staticmethod
    def word_stem(docs):
        """Replace every word by its ISRI stem."""
        stemmer = Preprocess._get_stemmer()
        return [[stemmer.stem(w) for w in ws] for ws in tqdm(docs)]

    @staticmethod
    def word_lower(docs):
        """Lower-case every word."""
        return [[w.lower() for w in ws] for ws in tqdm(docs)]

    @staticmethod
    def build_word_dict(docs):
        """Map each distinct word to an index, in order of first appearance."""
        word_dict = dict()
        for ws in docs:
            for w in ws:
                word_dict.setdefault(w, len(word_dict))
        return word_dict

    @staticmethod
    def word_index(docs, config):
        """Replace words by their indices, dropping words missing from the dict.

        Builds ``config['word_dict']`` from ``docs`` when it is ``None``.
        """
        if config['word_dict'] is None:
            config['word_dict'] = Preprocess.build_word_dict(docs)
        docs = [[
            config['word_dict'][w] for w in ws if w in config['word_dict']
        ] for ws in tqdm(docs)]
        return docs, config['word_dict']

    @staticmethod
    def save_lines(file_path, lines):
        """Write ``lines`` to a UTF-8 file, one per line."""
        with codecs.open(file_path, 'w', encoding='utf8') as f:
            for line in lines:
                f.write(line + "\n")

    @staticmethod
    def load_lines(file_path):
        """Return the lines of a UTF-8 file, line endings included."""
        with codecs.open(file_path, 'r', encoding='utf8') as f:
            return f.readlines()

    @staticmethod
    def save_dict(file_path, dic, sort=False):
        """Write ``dic`` as ``key value`` lines, optionally sorted by value."""
        items = dic.items()
        if sort:
            items = sorted(items, key=lambda d: d[1], reverse=False)
        lines = ['%s %s' % (k, v) for k, v in items]
        Preprocess.save_lines(file_path, lines)

    @staticmethod
    def load_dict(file_path):
        """Read ``key value`` lines into a dict; values stay strings."""
        lines = Preprocess.load_lines(file_path)
        dic = dict()
        for line in lines:
            k, v = line.split()
            dic[k] = v
        return dic

    def save_words_useless(self, words_useless_fp):
        """Write the set of filtered-out words, one per line."""
        Preprocess.save_lines(words_useless_fp, self._words_useless)

    def load_words_useless(self, words_useless_fp):
        """Load the set of filtered-out words written by save_words_useless()."""
        # Strip line endings so the loaded entries equal the saved words.
        self._words_useless = set(
            line.rstrip('\r\n')
            for line in Preprocess.load_lines(words_useless_fp))

    def save_word_dict(self, word_dict_fp, sort=False):
        """Write the word -> index dictionary."""
        Preprocess.save_dict(word_dict_fp, self._word_dict, sort)

    def load_word_dict(self, word_dict_fp):
        """Load a word dictionary; indices are read back as strings."""
        self._word_dict = Preprocess.load_dict(word_dict_fp)

    def save_words_stats(self, words_stats_fp, sort=False):
        """Write ``<word index> <cf> <df> <idf>`` lines for every indexed word."""
        items = self._word_dict.items()
        if sort:
            items = sorted(items, key=lambda d: d[1], reverse=False)
        lines = [
            '%s %d %d %f' % (wid, self._words_stats[w]['cf'],
                             self._words_stats[w]['df'],
                             self._words_stats[w]['idf'])
            for w, wid in items
        ]
        Preprocess.save_lines(words_stats_fp, lines)

    def load_words_stats(self, words_stats_fp):
        """Load statistics lines; entries are keyed by the first column as a string."""
        lines = Preprocess.load_lines(words_stats_fp)
        for line in lines:
            wid, cf, df, idf = line.split()
            self._words_stats[wid] = {
                'cf': int(cf),
                'df': int(df),
                'idf': float(idf)
            }