def tokenize(corpus, lemma=True, punctuation=True, space_to_space=False):
    if not punctuation:
        # table = str.maketrans({key: None for key in string.punctuation})
        # corpus = corpus.translate(table)
        corpus = corpus.replace(',', ' ')
        corpus = corpus.replace("\u220c", "")
        corpus = corpus.replace('(', ' ')
        corpus = corpus.replace(')', ' ')
        corpus = corpus.replace('.', ' ')
        corpus = corpus.replace("،", " ")
        corpus = corpus.replace("«", " ")
        corpus = corpus.replace("»", " ")
    if space_to_space:
        tokenized = corpus.split(' ')
    else:
        tokenized = word_tokenize(corpus)
    if lemma:
        lemmatizer = Lemmatizer()
        for i in range(len(tokenized)):
            tokenized[i] = lemmatizer.lemmatize(tokenized[i]).split('#')[0]
    return tokenized
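# Usage sketch (not from the original): assumes word_tokenize and Lemmatizer
# come from hazm; the Persian sample sentence is only illustrative.
from hazm import word_tokenize, Lemmatizer

# punctuation=False triggers the stripping branch above; lemma=True keeps only
# the part of each lemma before '#' (the past stem for hazm verb lemmas).
print(tokenize("کتاب‌ها را خواندم.", lemma=True, punctuation=False))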
def stemming_and_lemmatization(token):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    stemmed = stemmer.stem(token)
    lemmatized = lemmatizer.lemmatize(stemmed)
    return lemmatized
def evaluate_lemmatizer(conll_file='resources/train.conll', bijankhan_file='resources/bijankhan.txt'):
    lemmatizer = Lemmatizer()

    errors = []
    output = codecs.open('resources/lemmatizer_errors.txt', 'w', 'utf8')
    for line in dadegan_text(conll_file).split('\n'):
        parts = line.split('\t')
        if len(parts) < 10:
            continue
        word, lemma, pos = parts[1], parts[2], parts[3]
        if lemmatizer.lemmatize(word, pos) != lemma:
            errors.append((word, lemma, pos, lemmatizer.lemmatize(word, pos)))
    print(len(errors), 'errors', file=output)
    counter = Counter(errors)
    for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
        print(count, *item, file=output)

    missed = []
    output = codecs.open('resources/lemmatizer_missed.txt', 'w', 'utf8')
    bijankhan = BijankhanReader(bijankhan_file)
    for sentence in bijankhan.sents():
        for word in sentence:
            if word[1] == 'V':
                if word[0] == lemmatizer.lemmatize(word[0]):
                    missed.append(word[0])
    print(len(missed), 'missed', file=output)
    counter = Counter(missed)
    for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
        print(count, item, file=output)
def __init__(self, inFile, outFile):
    self.inFile = inFile
    self.outFile = outFile
    self.normalizer = Normalizer()
    self.tagger = POSTagger(model='resources/postagger.model')
    self.lemmatizer = Lemmatizer()
    self.stemmer = Stemmer()
def get_lemmatizer(self, document):
    ''' Lemmatizer '''
    content = self.clear_document(document)
    result = self.split_document(content)
    lemmatizer = Lemmatizer()
    lemma_set = [(item, lemmatizer.lemmatize(item)) for item in result]
    return lemma_set
def lemmatize(target_string):
    lemmatized_string = ""
    lemmatizer = Lemmatizer()
    for single_word in target_string.split():
        lemmatized_string += lemmatizer.lemmatize(single_word) + " "
    return lemmatized_string
def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement
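# Usage sketch (not from the original): `stops` is defined elsewhere in the
# original project, so it is stubbed here with hazm's stopwords_list, which is
# an assumption (older hazm releases expose it via hazm.utils instead).
from hazm import Normalizer, Lemmatizer, word_tokenize, stopwords_list

stops = stopwords_list()  # assumed stand-in for the project's own stopword list
print(statement_pre_processing("من به یادگیری ماشین بسیار علاقهمند هستم"))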
def __init__(self, component_config: Dict[Text, Any] = None) -> None:
    super().__init__(component_config)
    if self.component_config.stemmer:
        self._stemmer = Stemmer()
    if self.component_config.lemmatizer:
        self._lemmatizer = Lemmatizer()
    if self.component_config.pos:
        self._pos_tagger = POSTagger(model='resources/postagger.model')
def preprocess(doc):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()
    doc = normalizer.normalize(doc)
    tokenized = re.split(' |-', doc)
    for w in tokenized[:]:
        if w in stopwords:
            tokenized.remove(w)
    stemmed = [stemmer.stem(w) for w in tokenized]
    new_words = [word for word in stemmed if word.isalnum()]
    lemmatized = [lemmatizer.lemmatize(w) for w in new_words]
    return lemmatized
def dataset_cleaner(dataset):
    statements = []
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    for i in range(len(dataset)):
        normalized_statement = normalizer.normalize(dataset[i])
        # for sentence in sent_tokenize(dataset[i]):
        word_list = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(normalized_statement)
            if word not in stops
        ]
        statements.append(word_list)
    return statements
def find_tokens_in_sentence(sentence_ner, sentence_ner_lem):
    tokens_lem = []
    for token in sentence_ner_lem:
        if len(tokens_lem) > 0:
            if token['word'].startswith('##'):
                tokens_lem[-1]['word'] += ' ' + token['word'][2:]
                tokens_lem[-1]['index'] += 1
            elif (token['entity'].split('-')[1] == tokens_lem[-1]['entity_group']
                  and token['index'] == tokens_lem[-1]['index'] + 1):
                tokens_lem[-1]['word'] += ' ' + token['word']
                tokens_lem[-1]['index'] += 1
            else:
                tokens_lem += [{
                    'word': Lemmatizer().lemmatize(token['word']),
                    'entity_group': token['entity'].split('-')[1],
                    'index': token['index']
                }]
        else:
            tokens_lem += [{
                'word': Lemmatizer().lemmatize(token['word']),
                'entity_group': token['entity'].split('-')[1],
                'index': token['index']
            }]

    tokens = []
    for token in sentence_ner:
        if len(tokens) > 0:
            if token['word'].startswith('##'):
                tokens[-1]['word'] += ' ' + token['word'][2:]
                tokens[-1]['index'] += 1
            elif (token['entity'].split('-')[1] == tokens[-1]['entity_group']
                  and token['index'] == tokens[-1]['index'] + 1):
                tokens[-1]['word'] += ' ' + token['word']
                tokens[-1]['index'] += 1
            else:
                tokens += [{
                    'word': token['word'],
                    'entity_group': token['entity'].split('-')[1],
                    'index': token['index']
                }]
        else:
            tokens += [{
                'word': token['word'],
                'entity_group': token['entity'].split('-')[1],
                'index': token['index']
            }]

    return tokens, tokens_lem
def stemmer(email):
    """
    :param email: a string of email text
    :return: a string of the input in which each verb has been replaced by its root
    """
    # lemmatize/convert_to_stem are instance methods, so instances are created
    # here instead of the original class-level calls; the '&'-separated output
    # handled below matches parsivar's stemmer interface, which this `Stemmer`
    # name is presumably bound to in the original project.
    lemmatizer = Lemmatizer()
    word_stemmer = Stemmer()
    tokens = ''
    for word in email.split():
        token = lemmatizer.lemmatize(word)
        if '#' in token:
            token = token.split('#')
            if word in token[0]:
                token = token[0]
            else:
                token = token[1]
        else:
            token = word_stemmer.convert_to_stem(word)
            if '&' in token:
                token = token.split('&')
                if word in token[0]:
                    token = token[0]
                else:
                    token = token[1]
        tokens += token + ' '
    return tokens
class LemmaFilter(Filter):
    def __init__(self):
        self.lemmatizer = Lemmatizer()

    def __call__(self, tokens):
        for token in tokens:
            token.text = self.lemmatizer.lemmatize(token.text)
            yield token
def TextCleaner(self):
    self.stopwordsList = ''
    Data = self.imported_data
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    dataList = Data
    table = str.maketrans('', '', punctuation)
    for i in range(0, len(dataList)):
        for j in range(0, len(dataList[i][0])):
            dataList[i][0][j] = stemmer.stem(dataList[i][0][j])
            dataList[i][0][j] = lemmatizer.lemmatize(dataList[i][0][j])
        dataList[i][0] = [word for word in dataList[i][0] if word.isalpha()]
        dataList[i][0] = [w.translate(table) for w in dataList[i][0]]
        dataList[i][0] = [word for word in dataList[i][0] if len(word) > 3]
    self.imported_data = dataList
    return self.imported_data
def lemmatize(self):
    """
    :return: self.lemmatized_words, a list of lemmatized token lists, one per sentence
    """
    lemmatizer = Lemmatizer()
    for words in self.words:
        temp = []
        for word in words:
            word_lemma = lemmatizer.lemmatize(word)
            if word_lemma is not None:
                if "#" in word_lemma:
                    temp.append(word_lemma.split("#")[1])
                else:
                    temp.append(word_lemma)
            else:
                temp.append(word)
        self.lemmatized_words.append(temp)
    return self.lemmatized_words
def perform_word_lemmatization(data_dict):
    from hazm import Lemmatizer
    lemmatizer = Lemmatizer()
    return_value = {}
    for folder_name in data_dict.keys():
        return_value[folder_name] = {}
        for file_name in data_dict[folder_name].keys():
            this_files_words = []
            for sent_text in data_dict[folder_name][file_name]:
                this_sentences_words = []
                for word in sent_text:
                    lemma_word = lemmatizer.lemmatize(word)
                    this_sentences_words.append(lemma_word)
                this_files_words.append(this_sentences_words)
            return_value[folder_name][file_name] = this_files_words
    return return_value
def process_text(text):
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    text = text.replace("_", " ")
    text = text.replace(',', ' ')
    text = text.replace("\u220c", "")
    text = text.replace("\u200c", "")
    text = text.replace("-", "")
    # text = text.replace('/', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('.', ' ')
    text = text.replace("،", " ")
    text = text.replace("«", " ")
    text = text.replace("»", " ")
    # Convert the text string to a list of words: keep runs of Persian letters
    # plus the <S>, </s>, '?' and '//' markers, and drop everything else.
    t = re.findall("[\u0627-\u06FF]+|<S>|</s>|\?|//", text)
    lemma = Lemmatizer()
    text = [lemma.lemmatize(x) for x in t]
    return text
def tokenizer(input_var):
    tokenized = []
    normalizer1 = Normalizer(True, False, False)
    normalizer2 = Normalizer(False, True, False)
    normalizer3 = Normalizer(False, False, True)
    word_tokenizer = WordTokenizer(False)
    input_var = normalizer1.normalize(
        normalizer2.normalize(normalizer3.normalize(input_var)))
    actual = word_tokenizer.tokenize(input_var)
    lemmatizer = Lemmatizer()
    # stemmer = Stemmer()
    for x in actual:
        # print(x)
        s = lemmatizer.lemmatize(x)
        if "#" in s and s.split("#")[0] != "":
            # keep the past stem of the verb lemma and append "ن" to form the infinitive
            tokenized.append(s.split("#")[0] + "ن")
        else:
            tokenized.append(s.replace("#", ""))
    return tokenized
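# Usage sketch (not from the original): assumes the hazm version this snippet
# targets (positional Normalizer/WordTokenizer flags); the sentence is illustrative.
from hazm import Normalizer, WordTokenizer, Lemmatizer

print(tokenizer("من عاشق کدنویسی با پایتون هستم"))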
def train_dependency_parser(train_file='resources/train.conll', test_file='resources/test.conll',
                            model_file='langModel.mco', path_to_jar='resources/malt.jar',
                            options_file='resources/options.xml', features_file='resources/features.xml',
                            memory_min='-Xms7g', memory_max='-Xmx8g'):

    def read_conll(conll_file):
        trees = [DependencyGraph(item) for item in
                 dadegan_text(conll_file).replace(' ', '_').split('\n\n') if item.strip()]
        sentences = [[node['word'] for node in tree.nodelist[1:]] for tree in trees]
        return trees, sentences

    lemmatizer, tagger = Lemmatizer(), POSTagger()

    trees, sentences = read_conll(train_file)
    tagged = tagger.batch_tag(sentences)

    train_data = train_file + '.data'
    with codecs.open(train_data, 'w', 'utf8') as output:
        for tree, sentence in zip(trees, tagged):
            for i, (node, word) in enumerate(zip(tree.nodelist[1:], sentence), start=1):
                node['tag'] = word[1]
                node['lemma'] = lemmatizer.lemmatize(node['word'].replace('_', ' '), node['tag'])
                print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'),
                      node['tag'], node['tag'], '_', node['head'], node['rel'], '_', '_',
                      sep='\t', file=output)
            print(file=output)

    cmd = ['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources',
           '-c', model_file, '-i', train_data, '-f', options_file, '-F', features_file,
           '-m', 'learn']
    process = subprocess.Popen(cmd)
    process.wait()

    # evaluation
    print('\nEvaluating trained model on test data:')
    parser = DependencyParser(tagger=tagger, model_file=model_file)

    trees, sentences = read_conll(test_file)
    tagged = tagger.batch_tag(sentences)
    parsed = parser.tagged_batch_parse(tagged)

    test_data, test_results = test_file + '.data', test_file + '.results'
    print('\n'.join([sentence.to_conll(10) for sentence in trees]).strip(),
          file=codecs.open(test_data, 'w', 'utf8'))
    print('\n'.join([sentence.to_conll(10) for sentence in parsed]).strip(),
          file=codecs.open(test_results, 'w', 'utf8'))

    cmd = ['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]
    process = subprocess.Popen(cmd)
    process.wait()
def pipeline_sentence(sentence, model, tokenizer):
    sentence = change_words(sentence)
    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)
    sentence_lem = ' '.join([
        Lemmatizer().lemmatize(x)
        for x in word_tokenize(normalizer.normalize(sentence))
    ])
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    sentence_ner = nlp(sentence)
    sentence_ner_lem = nlp(sentence_lem)
    return sentence_ner, sentence_ner_lem, sentence_lem, sentence
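# Call sketch (not from the original): the checkpoint name is a placeholder for
# whichever Persian NER model the project loads, and change_words is assumed to
# be defined alongside the original function.
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "PERSIAN_NER_CHECKPOINT"  # placeholder, not taken from the original
ner_tokenizer = AutoTokenizer.from_pretrained(model_name)
ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
sentence_ner, sentence_ner_lem, sentence_lem, sentence = pipeline_sentence(
    "بازکردن فلنج ها جهت نصب صفحات مسدود کننده", ner_model, ner_tokenizer)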
class HazmTokenizer(Component):
    defaults = {"stemmer": True, "lemmatizer": True, 'pos': False}

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)
        if self.component_config.stemmer:
            self._stemmer = Stemmer()
        if self.component_config.lemmatizer:
            self._lemmatizer = Lemmatizer()
        if self.component_config.pos:
            self._pos_tagger = POSTagger(model='resources/postagger.model')

    def required_packages(self) -> List[Text]:
        return ['hazm']

    def process(self, message: Message, **kwargs: Any) -> None:
        text = message.text
        for sentence_str in sent_tokenize(text):
            sentence = Sentence(sentence_str)
            tokens = word_tokenize(sentence_str)
            pos_tags = []
            if self.component_config.pos:
                pos_tags = self._pos_tagger.tag(tokens)
            for idx, token_str in enumerate(tokens):
                token = Token(text=token_str)
                if self.component_config.stemmer:
                    token[TOKEN_ATTRIBUTE_STEM] = self._stemmer.stem(token_str)
                if self.component_config.lemmatizer:
                    token[TOKEN_ATTRIBUTE_LEMM] = self._lemmatizer.lemmatize(token_str)
                if self.component_config.pos:
                    token[TOKEN_ATTRIBUTE_POS] = pos_tags[idx][1]
                sentence.add_token(token)
            message.add_sentence(sentence)
class POS():
    def __init__(self, inFile, outFile):
        self.inFile = inFile
        self.outFile = outFile
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.lemmatizer = Lemmatizer()
        self.stemmer = Stemmer()

    def posTaggerTXT(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:20s} {:20s} {:20s} {:20s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(t)
                    line = f.readline()

    def posTaggerHTML(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                o.write(self.preHTML())
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:s} -//- {:s} -//- {:s} -//- {:s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(self.divHTML(self.colorTag(li[1]), t))
                        o.write("\n")
                    line = f.readline()
                o.write(self.posHTML())

    def nameTag(self, tag):
        if tag == "V":
            return "فعل"
        elif tag == "N":
            return "اسم"
        elif tag == "ADV":
            return "قید"
        elif tag == "PRO":
            return "ضمیر"
        elif tag == "PUNC":
            return "نشانه نگارشی"
        elif tag == "Ne":
            return "غیر قابل تشخیص"
        elif tag == "NUM":
            return "عدد"
        elif tag == "CONJ":
            return "حرف ربط"
        elif tag == "POSTP":
            return "نشانه مفعولی"
        elif tag == "P":
            return "حرف اضافه"
        elif tag == "AJ":
            return "صفت"
        elif tag == "DET":
            return "ضمیر اشاره"
        else:
            return tag

    def colorTag(self, tag):
        if tag == "V":
            return "red"
        elif tag == "N":
            return "hotpink"
        elif tag == "ADV":
            return "blue"
        elif tag == "PRO":
            return "gold"
        elif tag == "PUNC":
            return "lightblue"
        elif tag == "Ne":
            return "darkgray"
        elif tag == "NUM":
            return "white"
        elif tag == "CONJ":
            return "lightgreen"
        elif tag == "POSTP":
            return "white"
        elif tag == "P":
            return "aqua"
        elif tag == "AJ":
            return "teal"
        elif tag == "DET":
            return "slateblue"
        else:
            return "white"

    def preHTML(self):
        return """<!DOCTYPE html>
<head>
<meta charset="UTF-8">
</head>
<body>
"""

    def posHTML(self):
        return """
</body>
</html>"""

    def divHTML(self, color, text):
        # closing triple quote added so the last string literal is terminated
        return """
<div style="background-color:""" + color + """">
""" + """<h4>""" + text + """</h4>
""" + """</div>
"""
def Evaluate_lemmatizer(inputs, labels, lib='hazm'):
    predicted_labels_with_pos = []
    predicted_labels_no_pos = []

    if lib == 'hazm':
        lemmatizer = Lemmatizer()
        for sentence in inputs:
            sent_labels_with_pos = []
            sent_labels_no_pos = []
            for (word, pos) in sentence:
                if pos == 'ADJ':
                    pos = 'AJ'
                sent_labels_with_pos.append(lemmatizer.lemmatize(word, pos))
                sent_labels_no_pos.append(lemmatizer.lemmatize(word))
            predicted_labels_with_pos.append(sent_labels_with_pos)
            predicted_labels_no_pos.append(sent_labels_no_pos)
    elif lib == 'parsivar':
        stemmer = FindStems()
        for sentence in inputs:
            sent_labels_with_pos = []
            sent_labels_no_pos = []
            for (word, pos) in sentence:
                sent_labels_with_pos.append(stemmer.convert_to_stem(word, pos))
                sent_labels_no_pos.append(stemmer.convert_to_stem(word))
            for i in range(len(sentence)):
                if sentence[i][1] == 'V':
                    sent_labels_with_pos[i] = re.sub(r"&", r"#", sent_labels_with_pos[i])
                    sent_labels_no_pos[i] = re.sub(r"&", r"#", sent_labels_no_pos[i])
            predicted_labels_with_pos.append(sent_labels_with_pos)
            predicted_labels_no_pos.append(sent_labels_no_pos)

    precisions_with_pos = []
    precisions_no_pos = []
    all_truly_labeled_with_pos = []
    for i in range(len(labels)):
        truly_labeled_with_pos = [
            predicted_labels_with_pos[i][j] == labels[i][j]
            for j in range(len(labels[i]))
        ]
        all_truly_labeled_with_pos.append(truly_labeled_with_pos)
        num_truly_labeled_with_pos = sum(truly_labeled_with_pos)
        truly_labeled_no_pos = [
            predicted_labels_no_pos[i][j] == labels[i][j]
            for j in range(len(labels[i]))
        ]
        num_truly_labeled_no_pos = sum(truly_labeled_no_pos)
        precision_with_pos = num_truly_labeled_with_pos / len(labels[i])
        precision_no_pos = num_truly_labeled_no_pos / len(labels[i])
        precisions_with_pos.append(precision_with_pos)
        precisions_no_pos.append(precision_no_pos)

    per_pos = {}
    detailed_analyze = {}
    for i in range(len(inputs)):
        for j in range(len(inputs[i])):
            if inputs[i][j][1] not in per_pos.keys():
                per_pos[inputs[i][j][1]] = {'true': 0, 'false': 0}
            if all_truly_labeled_with_pos[i][j]:
                per_pos[inputs[i][j][1]]['true'] += 1
            else:
                per_pos[inputs[i][j][1]]['false'] += 1
            if inputs[i][j][1] not in detailed_analyze.keys():
                detailed_analyze[inputs[i][j][1]] = {'true': [], 'false': []}
            # detailed_analyze[inputs[i][j][1]]['gold'].append(labels[i][j])
            if all_truly_labeled_with_pos[i][j]:
                detailed_analyze[inputs[i][j][1]]['true'].append(inputs[i][j][0])
                # detailed_analyze[inputs[i][j][1]]['false'].append('NONE')
            else:
                detailed_analyze[inputs[i][j][1]]['false'].append(inputs[i][j][0])
                # detailed_analyze[inputs[i][j][1]]['true'].append('NONE')

    accuracy_per_pos = {
        k: v['true'] / (v['true'] + v['false'])
        for k, v in per_pos.items()
    }
    for k, v in detailed_analyze.items():
        v['true'] = set(v['true'])
        v['false'] = set(v['false'])

    precision_with_pos = sum(precisions_with_pos) / len(precisions_with_pos)
    precision_no_pos = sum(precisions_no_pos) / len(precisions_no_pos)
    return precision_with_pos, precision_no_pos, accuracy_per_pos, detailed_analyze
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from hazm import Lemmatizer, Normalizer, word_tokenize

if __name__ == "__main__":
    data = [
        "من به یادگیری ماشین بسیار علاقهمند هستم",
        "من عاشق کدنویسی با پایتون هستم",
        "من عاشق ساختن نرمافزارهای هوشمند هستم",
        "هوشمندسازی یک نرمافزار فرآیندی بسیار پیچیده است"
    ]

    normalizer = Normalizer()
    lemmatizer = Lemmatizer()

    data = [normalizer.normalize(_d) for i, _d in enumerate(data)]

    lemmatizer = Lemmatizer()
    tagged_data = [
        TaggedDocument(
            words=[
                lemmatizer.lemmatize(_d) for i, _d in enumerate(
                    word_tokenize(normalizer.normalize(_d.lower())))
            ],
            tags=[str(i)])
        for i, _d in enumerate(data)
    ]

    vec_size = 100
    alpha = 0.025

    model = Doc2Vec(vec_size=vec_size,
from hazm import Normalizer
from hazm import Lemmatizer
from hazm import word_tokenize

PUNCS = ['،', '.', ',', ':', ';', '"']
NORMALIZER = Normalizer()
LEMMATIZER = Lemmatizer()


def text_cleaner(text):
    normalized = NORMALIZER.normalize(text)
    tokenized = word_tokenizer(normalized)
    tokens = []
    for t in tokenized:
        temp = t
        for p in PUNCS:
            temp = temp.replace(p, '')
        tokens.append(temp)
    tokens = [w for w in tokens if not len(w) <= 1]
    tokens = [w for w in tokens if not w.isdigit()]
    tokens = [LEMMATIZER.lemmatize(w) for w in tokens]
    return ' '.join(tokens)


def word_tokenizer(text):
    return word_tokenize(text)
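# Usage sketch (not from the original); the sample string is reused from the
# Doc2Vec snippet elsewhere in this collection.
print(text_cleaner("بازکردن فلنج ها جهت نصب صفحات مسدود کننده"))
# punctuation is stripped per token, one-character tokens and digits are
# dropped, and the remaining tokens are lemmatized and re-joined with spaces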
def __init__(self):
    self.lemmatizer = Lemmatizer()
# from parsivar import FindStems
# my_stemmer = FindStems()
# print(my_stemmer.convert_to_stem('درافتادن'))

# from hazm import Stemmer
# stemmer = Stemmer()
# print(stemmer.stem('کتابها'))

from hazm import Lemmatizer

lemmatizer = Lemmatizer()
print(lemmatizer.lemmatize("آبادگری"))
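# Added examples (not from the original) showing the lemma formats the other
# snippets split on: hazm returns verb lemmas as past and present stems joined
# by '#', while non-verbs come back as a single form.
print(lemmatizer.lemmatize('می‌روم'))   # 'رفت#رو' (past stem # present stem)
print(lemmatizer.lemmatize('کتاب‌ها'))  # 'کتاب'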
def lemmatizer(tweets):
    lemmatizer_tweets = []
    # build the hazm lemmatizer once instead of once per tweet
    hazm_lemmatizer = Lemmatizer()
    for tweet in tweets:
        lemmatizer_tweets.append(hazm_lemmatizer.lemmatize(tweet))
    return lemmatizer_tweets
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.simplefilter(action='ignore', category=FutureWarning)

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from hazm import Lemmatizer, Normalizer, word_tokenize
from pyravendb.store import document_store
from DocumentObjects import Document
import pandas as pd
import json

if __name__ == "__main__":
    # data = pd.read_excel("dataset.xlsx")
    # data = list(data["description"].astype(str).values.flatten())

    normalizer = Normalizer()
    lemmatizer = Lemmatizer()

    model = Doc2Vec.load("BasicModel")

    test_str = "بازکردن فلنج ها جهت نصب صفحات مسدود کننده"
    test_str = normalizer.normalize(test_str)
    test_data = [lemmatizer.lemmatize(_d) for i, _d in enumerate(word_tokenize(test_str))]

    result = model.docvecs.most_similar([model.infer_vector(test_data)], topn=10)

    store = document_store.DocumentStore(urls=["http://localhost:8080"], database="SeSimi")
    store.initialize()
    with store.open_session() as session:
        print()
        [print(list(session.query(collection_name='Documents').where(key=result[i][0]))[0].title)
         for i in range(0, len(result))]
stopwords_f = open('stop_words.txt', 'r', encoding='utf-8')
stopwords = stopwords_f.readlines()
for i in range(len(stopwords)):
    stopwords[i] = stopwords[i].replace("\n", "")

samewords_f = open('same_words.txt', 'r', encoding='utf-8')
samewords = samewords_f.readlines()
# samewords_tokens = word_tokenize(samewords_f.read(), "\n")
for i in range(len(samewords)):
    samewords[i] = samewords[i].replace("\n", "")
    samewords[i] = word_tokenize(samewords[i])
# print('same=' + str(samewords))

samewords_f.close()
stopwords_f.close()
# print('stop=' + str(stopwords))

lemmatizer = Lemmatizer()
normalizer = Normalizer()
# print(query_process("ما تو را کودک،. کتابهای به برای دوست داریم خودرو را هنوز اتومبیل"))


@app.route('/api/dataframe', methods=['GET'])
def df():
    return j


def find_in_dictionary(word, dictionary1):
    if word in dictionary1:
        # returns a copy of the list of docIDs where the term occurs
        return dictionary1[word].copy()
    else:
        return []
class Preprocessor:
    normalizer = Normalizer()
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tokenizer = WordTokenizer()
    stop_words = stopwords_list()

    @staticmethod
    def remove_noise(text: str) -> str:
        return Preprocessor.__remove_punctuation(
            Preprocessor.__remove_emojis(text))

    @staticmethod
    def remove_stop_words(tokens: List) -> List:
        return [t for t in tokens if t not in Preprocessor.stop_words]

    @staticmethod
    def __remove_emojis(text: str):
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u'\U00010000-\U0010ffff'
            u"\u200d"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\u3030"
            u"\ufe0f"
            "]+",
            flags=re.UNICODE)
        first_cleaned_text = emoji_pattern.sub(r'', text)  # no emoji
        return emoji.get_emoji_regexp().sub(r'', first_cleaned_text)

    @staticmethod
    def __remove_punctuation(text: str):
        try:
            return re.sub(
                r'[\.\?\!\,\:\;\،\(\)\؛\#\%\^\&\$\~\'\"\×\-\_\*\>\<\+\=\\\/]',
                '', text)
        except TypeError as e:
            print(e, text)

    @staticmethod
    def normalize(text: str) -> str:
        return Preprocessor.normalizer.normalize(text)

    @staticmethod
    def stem(word: str) -> str:
        return Preprocessor.stemmer.stem(word)

    @staticmethod
    def lemmatize(word: str) -> str:
        return Preprocessor.lemmatizer.lemmatize(word)

    @staticmethod
    def tokenize(text: str) -> List:
        return Preprocessor.tokenizer.tokenize(text)

    @staticmethod
    def preprocess(text: str) -> str:
        cleaned_text = Preprocessor.remove_noise(str(text))
        normalized_text = Preprocessor.normalize(cleaned_text)
        tokens = Preprocessor.tokenize(normalized_text)
        none_stop_words = Preprocessor.remove_stop_words(tokens)
        # stems = [Preprocessor.stem(w) for w in tokens]
        lemmatized = [Preprocessor.lemmatize(w) for w in none_stop_words]
        return ' '.join(lemmatized)
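# Usage sketch (not from the original): assumes hazm (Normalizer, Stemmer,
# Lemmatizer, WordTokenizer, stopwords_list), emoji, re, and typing.List were
# imported before the class definition; the sample sentence is illustrative.
print(Preprocessor.preprocess("من عاشق ساختن نرمافزارهای هوشمند هستم"))
# pipeline: noise removal -> normalization -> tokenization -> stopword removal -> lemmatization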