import re
import string
from math import log

import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem.arlstem import ARLSTem
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class fastTextEmbedder:
    """Embeds Arabic text with pretrained fastText vectors loaded via gensim."""

    def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        # self.model = pickle.load(open(self.model_path, "rb"))
        self.model = KeyedVectors.load_word2vec_format(
            self.model_path, encoding='utf-8', unicode_errors='ignore')
        print("done loading fastText model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        self.vocab = self.model.vocab  # gensim < 4.0; use key_to_index in 4.x

    def tokenize_string(self, text):
        """
        :param text: string sentence
        :return: list of (token, ARLSTem stem) pairs; tokens containing
                 punctuation symbols are dropped
        """
        tokens = self.tokenizer.tokenize(text)
        tokens_stemmed = []
        for token in tokens:
            if not any(s in token for s in self.SYMBOLS):
                tokens_stemmed.append((token, self.stemmer.stem(token)))
        return tokens_stemmed

    def embed_tokens(self, sent, max_len):
        """Return a (max_len, 300) matrix of per-token embeddings, zero-padded."""
        sent_tokens = self.tokenize_string(sent)
        embedding = np.zeros((max_len, 300))
        j = 0  # next free row; rows are packed, skipping OOV tokens
        for i in range(min(len(sent_tokens), max_len)):
            if sent_tokens[i][0] in self.vocab:    # try the surface form first
                embedding[j] = self.model[sent_tokens[i][0]]
                j += 1
            elif sent_tokens[i][1] in self.vocab:  # fall back to the stem
                embedding[j] = self.model[sent_tokens[i][1]]
                j += 1
        return embedding

    def embed(self, sent):
        """
        :param sent: string sentence
        :return: embedding of the sentence (sum of token vectors) as an
                 np vector of dim=300
        """
        sent_tokens = self.tokenize_string(sent)
        embedding = np.zeros(300)
        for token in sent_tokens:
            if token[0] in self.vocab:
                embedding += self.model[token[0]]
            elif token[1] in self.vocab:
                embedding += self.model[token[1]]
        return embedding
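
# --- Illustrative usage (a sketch, not from the original source) -------------
# The file name "cc.ar.300.vec" is an assumed example of Arabic fastText
# vectors in word2vec text format; substitute your own model path.
def _demo_fasttext_embedder():
    embedder = fastTextEmbedder("cc.ar.300.vec")  # assumed model path
    sent = "أين تقع القاهرة ؟"
    print(embedder.embed(sent).shape)             # (300,) summed sentence vector
    print(embedder.embed_tokens(sent, 50).shape)  # (50, 300) padded token matrix
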
class countReader:
    """Retrieves the answer candidate whose 1-4-gram count vector has the
    highest cosine similarity to the question's."""

    SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    def __init__(self, P):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = self.get_answer_canditates(P)
        docs_stem = [self.stem_string(doc) for doc in self.docs]
        # self.stopwords = stopwords.words('arabic')
        # note: loaded but not passed to the vectorizer (see the commented-out
        # stop_words argument below)
        with open("stopwords-ur.txt") as f:
            self.stopwords = [line.lower() for line in f.read().splitlines()]
        self.vectorizer = CountVectorizer(ngram_range=(1, 4))  # , stop_words=self.stopwords)
        self.count_matrix = self.vectorizer.fit_transform(docs_stem)

    def stem_string(self, text):
        """Keep punctuation-free tokens; emit each token followed by its stem."""
        tokens = self.tokenizer.tokenize(text)
        str_processed = ""
        for token in tokens:
            if not any(s in token for s in self.SYMBOLS):
                str_processed += token + " " + self.stemmer.stem(token) + " "
        return str_processed

    def concatenateString(self, paragraph, start, length):
        """Join `length` words of `paragraph` starting at index `start`."""
        return " ".join(paragraph[start:start + length])

    def get_answer_canditates(self, paragraph):
        """Enumerate every word span of length 1..15 within each sentence."""
        candidates = []
        for sent in nltk.sent_tokenize(paragraph):
            para_words = sent.split()
            for i in range(len(para_words)):
                for j in range(1, min(15, len(para_words) - i + 1)):
                    candidates.append(self.concatenateString(para_words, i, j))
        return candidates

    def read(self, P, Q):
        """Return the candidate span most similar to question Q.
        (P is unused here; the candidates were built from the paragraph
        passed to __init__.)"""
        Q = self.stem_string(Q)
        query_vec = self.vectorizer.transform([Q])
        similarities = cosine_similarity(self.count_matrix, query_vec)
        max_index = int(np.argmax(similarities))
        return self.docs[max_index]
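
# --- Illustrative usage (a sketch, not from the original source) -------------
# Requires "stopwords-ur.txt" in the working directory (loaded in __init__).
# The paragraph and question strings here are placeholders.
def _demo_count_reader():
    P = "تقع القاهرة في مصر . وهي أكبر مدينة عربية ."  # placeholder paragraph
    reader = countReader(P)                             # builds the candidate index
    print(reader.read(P, "أين تقع القاهرة ؟"))          # best-matching span
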
class Arabic_preprocessing:
    """Normalizes and cleans Arabic text: diacritics, punctuation, hashtags,
    mentions, emojis, character elongation, and optional ARLSTem stemming."""

    def __init__(self):
        # preparing the Arabic emoji lexicon (emoji -> Arabic translation)
        emojis_lexicon = pd.read_csv(
            'resources/emoji_lexicon.csv', header=None,
            names=['emoji', 'utf-8', 'unicode', 'arabic_translation'])
        self.emojis_lexicon_dict = dict()
        for index, row in emojis_lexicon.iterrows():
            self.emojis_lexicon_dict[row['emoji'].replace(' ', '')] = row['arabic_translation']

        # preparing the Arabic stop-words list
        with open('resources/arabic_stop_words.txt', 'r', encoding="utf8") as f:
            # construct the list of stop words, stripping newline characters
            self.stop_words = [line.strip() for line in f.readlines()]

        # preparing the punctuation set (Arabic + English)
        arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        english_punctuations = string.punctuation
        self.all_punctuations = set(arabic_punctuations + english_punctuations)

        # initializing the stemmer (requires NLTK >= 3.2.5)
        self.stemmer = ARLSTem()

        self.arabic_diacritics = re.compile(
            """
             ّ    | # Tashdid
             َ    | # Fatha
             ً    | # Tanwin Fath
             ُ    | # Damma
             ٌ    | # Tanwin Damm
             ِ    | # Kasra
             ٍ    | # Tanwin Kasr
             ْ    | # Sukun
             ـ     # Tatwil/Kashida
            """, re.VERBOSE)

    def normalize_arabic(self, text):
        text = re.sub("[إأآاٱ]", "ا", text)
        text = re.sub("ى", "ي", text)
        # text = re.sub("ؤ", "ء", text)
        # text = re.sub("ئ", "ء", text)
        text = re.sub("ة", "ه", text)  # replace ta marbuta with ha
        text = re.sub("گ", "ك", text)
        text = re.sub("\u0640", '', text)  # remove tatweel
        return text

    def remove_punctuations(self, text):
        for p in self.all_punctuations:
            if p in text:
                text = text.replace(p, '')
        return text

    def remove_diacritics(self, text):
        return re.sub(self.arabic_diacritics, '', text)

    def remove_repeating_char(self, text):
        # collapse any run of a repeated character to a single occurrence
        return re.sub(r'(.)\1+', r'\1', text)

    def remove_mention(self, text):
        return re.sub(r'@\S+', '', text)

    def hashtag_match(self, match_object):
        return match_object.group(1).replace('_', ' ')

    def normalize_hashtag(self, text):
        # "#some_tag" -> "some tag"
        return re.sub(r'#(\S+)', self.hashtag_match, text)

    def emojis_match(self, match_object):
        return ' ' + ' '.join(list(match_object.group(1))) + ' '

    def separate_emojis(self, text):
        # split runs of emojis into space-separated characters
        emojis_unicode = r'([\U0001F600-\U0001F64F\U00002000-\U00003000]+)'
        return re.sub(emojis_unicode, self.emojis_match, text)

    def replace_emojis(self, text):
        # substitute each emoji with its Arabic translation from the lexicon
        new_text = ""
        for l in text:
            new_text += self.emojis_lexicon_dict[l] if l in self.emojis_lexicon_dict else l
        return new_text

    def remove_english_characters(self, text):
        return re.sub(r'[a-zA-Z]+', '', text)

    def clean_stop_words(self):
        # normalize, and remove diacritics from, stop words to increase the
        # possibility of matching against normalized text
        self.stop_words = [
            self.remove_diacritics(self.normalize_arabic(word))
            for word in self.stop_words
        ]

    def preprocess_arabic_text(self, text, stem=True, replace_emojis=True, normalize_arabic=True):
        self.clean_stop_words()
        text = text.replace('\\n', ' ').replace('\n', ' ')
        text = self.remove_mention(text)
        text = self.normalize_hashtag(text)
        text = self.remove_punctuations(text)
        text = self.remove_diacritics(text)
        if normalize_arabic:
            text = self.normalize_arabic(text)
        text = self.separate_emojis(text)
        if replace_emojis:
            text = self.replace_emojis(text)
        text = self.remove_repeating_char(text)
        text = self.remove_english_characters(text)
        words = nltk.word_tokenize(text)
        words = [word for word in words if word not in self.stop_words]
        if stem:
            words = [self.stemmer.stem(word) for word in words]
        return ' '.join(words)  # return a sentence (str), not a list of words
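
# --- Illustrative usage (a sketch, not from the original source) -------------
# Requires resources/emoji_lexicon.csv and resources/arabic_stop_words.txt.
# The input string is a placeholder showing a mention, hashtag, elongation,
# and an emoji being cleaned in one pass.
def _demo_preprocessing():
    pre = Arabic_preprocessing()
    raw = "#سؤال_مهم @user أيــــن تقع القاهرة؟ 😀"
    print(pre.preprocess_arabic_text(raw, stem=False))
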
class SWDbasline:
    """Sliding Window + Distance (SWD) baseline reader: scores sentence
    candidates by IC-weighted word overlap with the question minus a
    word-distance penalty."""

    def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    def tokenize_string(self, text):
        tokens = self.tokenizer.tokenize(text)
        tokens_stemmed = []
        for token in tokens:
            if not any(s in token for s in self.SYMBOLS):
                tokens_stemmed.append(self.stemmer.stem(token))
        return tokens_stemmed

    def IC(self, w, P):
        """Inverse-count weight: rare words in P contribute more."""
        return log(1 + 1 / self.C(w, P), 2)

    def C(self, w, P):
        """Number of occurrences of word w in token list P."""
        return sum(1 for word in P if word == w)

    def sliding_window_helper(self, P, Q, A):
        # For each candidate A[i], slide a window of size |Q ∪ A[i]| over P and
        # keep the window with the largest summed IC weight of matched words.
        res = []
        for i in range(len(A)):
            S = list(set().union(Q, A[i]))
            cur = 0
            for j in range(len(P) - len(S) + 1):
                total = 0
                for w in range(len(S)):
                    if P[j + w] in S:
                        total += self.IC(P[j + w], P)
                cur = max(cur, total)
            res.append(cur)
        return res

    def sliding_window(self, P, Q, A):
        return self.sliding_window_helper(self.tokenize_string(P),
                                          self.tokenize_string(Q), A)

    def dist(self, P, q, a):
        # minimum number of token positions separating an occurrence of q from
        # a following occurrence of a, in either order
        res = len(P) + 1
        for i in range(len(P)):
            if P[i] == q or P[i] == a:
                if P[i] == q:
                    a, q = q, a  # search for the *other* word after position i
                index = self.find_after(P, q, i)
                if index != -1:
                    res = min(res, index - i)
        return res

    def find_after(self, L, w, i):
        for j in range(i, len(L)):
            if L[j] == w:
                return j
        return -1

    def distance_based_helper(self, P, Q, A):
        res = []
        U = set(stopwords.words('arabic')) & set(P)
        SQ = list(set(P) & set(Q) - U)  # non-stop question words found in P
        for i in range(len(A)):
            SA = list(((set(A[i]) & set(P)) - set(Q)) - U)  # candidate-only words in P
            d = len(P) + 1
            if len(SQ) == 0 or len(SA) == 0:
                d = 1
            else:
                for q in SQ:
                    for a in SA:
                        d = min(d, self.dist(P, q, a))
            d *= 1 / (len(P) - 1)  # normalize by passage length
            res.append(d)
        return res

    def distance_based(self, P, Q, A):
        return self.distance_based_helper(self.tokenize_string(P),
                                          self.tokenize_string(Q), A)

    def argmax(self, l):
        return l.index(max(l))

    def SW(self, P, Q, A):
        return self.argmax(self.sliding_window(P, Q, A))

    def concatenateString(self, paragraph, start, length):
        """Join `length` words of `paragraph` starting at index `start`."""
        return " ".join(paragraph[start:start + length])

    def get_answer_canditates(self, paragraph):
        return nltk.sent_tokenize(paragraph)

    def read_score(self, P, Q):
        """
        Implements the SWD algorithm.
        :param P: paragraph string
        :param Q: question string
        :return: (answer sentence, absolute SWD score)
        """
        A = self.get_answer_canditates(P)
        # fix: tokenize candidates so the helpers compare words, not characters
        A_tokens = [self.tokenize_string(a) for a in A]
        ret_sw = self.sliding_window(P, Q, A_tokens)
        ret_d = self.distance_based(P, Q, A_tokens)
        scores = [x - y for x, y in zip(ret_sw, ret_d)]
        max_indx = self.argmax(scores)
        return A[max_indx], abs(scores[max_indx])

    def read(self, P, Q):
        """
        Implements the SWD algorithm.
        :param P: paragraph string
        :param Q: question string
        :return: answer sentence with the highest SWD score
        """
        A = self.get_answer_canditates(P)
        # fix: tokenize candidates so the helpers compare words, not characters
        A_tokens = [self.tokenize_string(a) for a in A]
        ret_sw = self.sliding_window(P, Q, A_tokens)
        ret_d = self.distance_based(P, Q, A_tokens)
        scores = [x - y for x, y in zip(ret_sw, ret_d)]
        return A[self.argmax(scores)]
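
# --- Illustrative usage (a sketch, not from the original source) -------------
# SWDbasline picks the paragraph sentence whose sliding-window overlap score,
# minus the distance penalty, is highest. Requires the NLTK Arabic stop-words
# corpus; the paragraph and question strings here are placeholders.
def _demo_swd_baseline():
    reader = SWDbasline()
    P = "تقع القاهرة في مصر . وهي أكبر مدينة عربية ."  # placeholder paragraph
    Q = "أين تقع القاهرة ؟"
    print(reader.read(P, Q))        # best sentence
    print(reader.read_score(P, Q))  # (sentence, score)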