from nltk.stem.snowball import ItalianStemmer


def stemming(data):
    """Stem every word in data with the Snowball Italian stemmer."""
    stemmer = ItalianStemmer()
    filtered = []
    for word in data:
        filtered.append(stemmer.stem(word))
    return filtered
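A minimal usage sketch (the stems shown in the comment are approximate):

words = ["parlando", "libri", "bella"]
print(stemming(words))  # e.g. ['parl', 'libr', 'bell']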
import re

from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import ItalianStemmer
from nltk.tokenize import RegexpTokenizer


def item_preprocessing(descr_text):
    """
    Preprocess a text string.

    :param descr_text: the raw description text
    :return: the 50 most common stemmed tokens as (token, count) pairs
    """
    # Tokenize the string, excluding all special characters that are not
    # alphanumeric or underscore
    tokenizer = RegexpTokenizer(r'\w+')
    token_list = tokenizer.tokenize(descr_text.lower())
    # Create an Italian stop-word set
    stop_words = set(stopwords.words('italian'))
    # Create both Italian and English stemmers, since this particular dataset
    # mixes the two languages
    ita_stemmer = ItalianStemmer()
    eng_stemmer = PorterStemmer()
    # Remove stop words
    filtered_token = [token for token in token_list if token not in stop_words]
    # Remove tokens consisting only of digits
    filtered_token = [
        token for token in filtered_token
        if not re.search(r'\b[0-9]+\b\s*', token)
    ]
    # Stem the tokens with both the Italian and the English stemmer
    filtered_token = [ita_stemmer.stem(token) for token in filtered_token]
    filtered_token = [eng_stemmer.stem(token) for token in filtered_token]
    filtered_token = FreqDist(filtered_token).most_common(50)
    return filtered_token
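A quick usage sketch on a made-up description string (assumes the NLTK stopwords corpus is downloaded; stems are approximate):

text = "Appartamento luminoso con tre camere e un ampio giardino"
print(item_preprocessing(text)[:3])
# e.g. [('appartament', 1), ('lumin', 1), ('tre', 1)]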
import re

from nltk.stem.snowball import ItalianStemmer
from spacy.lang.it import Italian


def preprocess(text, NUM_DOCS, num_preprocessed, stemming):
    # i and it_stopwords are module-level globals
    global i
    if i == 0:
        i = num_preprocessed
    i += 1
    result = []
    stemmer = ItalianStemmer()
    if i % 20 == 0:
        print(f"\t{i} out of {NUM_DOCS + num_preprocessed} documents preprocessed")
    nlp = Italian()
    # Keep only the text between "Sentenza" and "Lingua processuale"
    t1 = text.split("Lingua processuale")[0].split("Sentenza")[-1]
    # Strip apostrophes, quotes, directive references, numbers and URLs
    t1 = re.sub(
        r"’|'|«|»|\d{1,4}\/\d{1,4}\/(cee|ce)|\d+|---\|*|^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$",
        " ", t1, flags=re.IGNORECASE)
    doc = nlp(t1)
    for token in doc:
        if (token.text.lower() not in it_stopwords
                and not (token.is_punct or token.is_space)
                and len(token) > 3):
            assert token.lang_ == "it"
            if stemming:
                result.append(stemmer.stem(word=token.text))
            else:
                result.append(token.lemma_.lower())
                if "'" in result[-1] or "’" in result[-1]:
                    raise Exception(f"Detected_ {token.lemma_}")
    return result
from nltk.stem.snowball import ItalianStemmer


def stemm(reviews):
    stemm_reviews = []
    stemmer = ItalianStemmer()
    for review in reviews:
        # Each review is a list of tokens; stem them and rejoin into a string
        clean = ' '.join([stemmer.stem(w) for w in review])
        stemm_reviews.append(clean)
    return stemm_reviews
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import ItalianStemmer


def clean_stop_words(df, column, lang, stem=True):
    """
    (df, str, str) -> df

    Cleans a dataframe column from stopwords of the given language.

    :param df: dataframe to clean
    :param column: column of the dataframe to clean
    :param lang: language of the stopwords
    :param stem: whether stemming is applied
    :return: cleaned dataframe
    """
    # Iterate over the rows (df.shape[0]), not the columns
    for i in range(df.shape[0]):
        df.loc[i, column] = re.sub('[^a-zA-Z]', ' ', df[column][i])
    document = df[column].str.lower().str.split()
    sentence_stem = []
    document_stem = []
    nltk_stop = stopwords.words(lang)
    clean_document = document.apply(
        lambda x: [item for item in x if item not in nltk_stop])
    stemmer = ItalianStemmer()
    if stem:
        for sentence in clean_document:
            for word in sentence:
                word = stemmer.stem(word)
                sentence_stem.append(word)
            document_stem.append(sentence_stem)
            sentence_stem = []
        sentences = [' '.join(i) for i in document_stem]
        cleaned_series = pd.Series((v for v in sentences))
        df[column] = cleaned_series
    else:
        sentences = [' '.join(i) for i in clean_document]
        cleaned_series = pd.Series((v for v in sentences))
        df[column] = cleaned_series
    return df
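A small usage sketch on a toy dataframe (output stems are approximate):

df = pd.DataFrame({'text': ["Le case sono belle!", "Un grande giardino."]})
print(clean_stop_words(df, 'text', 'italian')['text'].tolist())
# e.g. ['cas bell', 'grand giardin']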
from nltk.stem.snowball import ItalianStemmer


def stem_words(wrd):
    stemmer = ItalianStemmer()  # Select the stemmer from nltk
    stems = []                  # List of updated words
    for word in wrd:
        stem = stemmer.stem(word)  # Stem the word
        stems.append(stem)         # and append it to the list
    return stems
def preprocess_string(s, lower=True, stem=True, remove_stopwords=True,
                      remove_punctuation=True):
    """
    Clean up a string.

    Keyword arguments:
    s -- the input string
    lower -- lowercase every char
    stem -- extract the root of every word
    remove_stopwords -- well, self-explanatory
    remove_punctuation -- self-explanatory too
    """
    # lowercase every char
    # (entity recognition also uses uppercase chars)
    if lower:
        s = s.lower()

    # replace accents
    accent_chars = {
        "è": "e", "é": "e", "à": "a", "ò": "o",
        "ó": "o", "ù": "u", "ì": "i",
    }
    for char in accent_chars:
        # str.replace returns a new string; the result must be reassigned
        s = s.replace(char, accent_chars[char])

    # tokenize and remove punctuation
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r"\w+")
    s = tokenizer.tokenize(s)

    # stem the words
    if stem:
        from nltk.stem.snowball import ItalianStemmer
        stemmer = ItalianStemmer()
        s = [stemmer.stem(word) for word in s]

    # remove stopwords (Italian)
    if remove_stopwords:
        from stop_words import get_stop_words
        s = [word for word in s if word not in get_stop_words('it')]

    return " ".join(s)
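A usage sketch (requires the stop_words package from PyPI; the result shown is approximate, since stemming runs before the stopword filter here):

print(preprocess_string("È una casa bellissima!"))  # e.g. 'cas belliss'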
from nltk.corpus import stopwords
from nltk.stem.snowball import ItalianStemmer
from nltk.tokenize import RegexpTokenizer


def preprocessQuery(query):
    # initialize tokenizer, stop words and stemmer
    tokenizer = RegexpTokenizer(r'\w+')
    stopWords = set(stopwords.words('italian'))
    stemmer = ItalianStemmer()
    rawText = query.lower()
    # tokenize
    tokens = tokenizer.tokenize(rawText)
    # remove stop words
    effectiveTokens = [t for t in tokens if t not in stopWords]
    # stemming
    result = [stemmer.stem(t) for t in effectiveTokens]
    return result
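A quick check of the query pipeline (stems shown are approximate):

print(preprocessQuery("appartamenti con terrazzo a Roma"))
# e.g. ['appartament', 'terrazz', 'rom']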
from nltk.corpus import stopwords
from nltk.stem.snowball import ItalianStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# textual_analysis is a project-local module providing StemmedCountVectorizer
import textual_analysis


def get_TFIDFmatrix_vect(data, do_stemming):
    """
    data: input textual collection
    do_stemming: boolean. If True, stemming is performed; otherwise only
                 tokenized words are analyzed (words are at least 2 chars
                 long and do not contain numbers)

    returns a tuple <matrix, tf> where matrix is the normalized tf-idf matrix
    and tf is the vectorizer
    """
    min_df = 10
    if do_stemming:
        italian_stemmer = ItalianStemmer()
        tf = textual_analysis.StemmedCountVectorizer(
            token_pattern=u'([a-z]{2,})',
            min_df=min_df,
            analyzer="word",
            stop_words=stopwords.words('italian'),
            norm='l2')
    else:
        tf = TfidfVectorizer(
            token_pattern=u'([a-z]{2,})',
            sublinear_tf=True,
            use_idf=True,
            stop_words=stopwords.words('italian'),
            max_df=0.1,
            min_df=min_df,
            norm='l2'
        )
    # CountVectorizer supports counts of N-grams of words or consecutive
    # characters.
    matrix = tf.fit_transform(data)
    return matrix, tf
from nltk.corpus import stopwords
from nltk.stem.snowball import ItalianStemmer
from nltk.tokenize import RegexpTokenizer


def preprocessData(documents):
    # initialize tokenizer, stop words and stemmer
    tokenizer = RegexpTokenizer(r'\w+')
    stopWords = set(stopwords.words('italian'))
    stemmer = ItalianStemmer()
    texts = []
    # loop through the document list
    for doc in documents:
        rawText = doc.lower()
        # tokenize
        tokens = tokenizer.tokenize(rawText)
        # remove stop words
        effectiveTokens = [t for t in tokens if t not in stopWords]
        # stemming
        stemmedTokens = [stemmer.stem(t) for t in effectiveTokens]
        # add tokens to the list
        texts.append(stemmedTokens)
    return texts
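A usage sketch on two toy documents (stems are approximate):

docs = ["La casa è grande.", "Il giardino è piccolo."]
print(preprocessData(docs))  # e.g. [['cas', 'grand'], ['giardin', 'piccol']]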
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import ItalianStemmer
from nltk.tokenize import word_tokenize


def remove_step(doc):
    """
    Takes the document string as input, removes stopwords and punctuation,
    and applies stemming.

    input:
        - string of the document
    output:
        - string of space-joined terms after the stemming process
    """
    # check if it's a nan value
    if isinstance(doc, float):
        return str(doc)
    sp = string.punctuation + '“”–’°•€'
    doc = doc.replace("\\n", " ")
    # punctuation
    doc = [c if c not in sp else " " for c in doc]
    doc = ''.join(doc)
    # stopwords
    doc = [
        word for word in doc.split()
        if word.lower() not in stopwords.words('italian')
    ]
    doc = ' '.join(doc)
    # stemming
    ps = ItalianStemmer()
    words = word_tokenize(doc)
    w_lst = []
    for w in words:
        w_lst.append(ps.stem(w))
    return ' '.join(w_lst)
import string

from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords as sw
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import ItalianStemmer
from sklearn.base import BaseEstimator, TransformerMixin


class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True,
                 Language='English'):
        self.lower = lower
        self.strip = strip
        self.punct = punct or set(string.punctuation)
        if Language == 'English':
            self.stopwords = stopwords or set(sw.words('english'))
            self.stemmer = PorterStemmer()
        elif Language == 'Italian':
            self.stopwords = stopwords or set(sw.words('italian'))
            self.stemmer = ItalianStemmer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part-of-speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Stem the token and yield (despite the variable name,
                # this is stemming, not lemmatization)
                lemma = self.stemmer.stem(token)
                yield lemma
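A usage sketch inside a scikit-learn pipeline; since transform() already yields token lists, the vectorizer gets identity tokenizer/preprocessor callables (requires the NLTK punkt and stopwords data):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('preprocessor', NLTKPreprocessor(Language='Italian')),
    ('vectorizer', TfidfVectorizer(tokenizer=lambda x: x,
                                   preprocessor=lambda x: x,
                                   lowercase=False)),
])
X = model.fit_transform(["La casa è molto bella.", "Il giardino è verde."])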
def myTokenizer(self, corpus, no_numbers=True, min_length=1,
                stop_words_bool=False, stemmer=False):
    tokens = []
    unique_words = set()
    replace = [
        '>', '<', '-', '|', '\\', '/', '^', '\n', '”', '“', '"', '’', '...'
    ]
    ita_stemmer = ItalianStemmer()
    for doc in corpus:
        if no_numbers:
            doc = re.sub(r'\d+', '', doc)
        for punct in string.punctuation:
            doc = doc.replace(punct, " ")
        for specialChar in replace:
            doc = doc.replace(specialChar, ' ')
        split_doc = [
            token.lower().strip() for token in doc.split(" ") if token
        ]
        split_doc = [word for word in split_doc if len(word) > min_length]
        if stemmer:
            split_doc = [ita_stemmer.stem(word) for word in split_doc]
        unique_words.update(set(split_doc))
        tokens.append(split_doc)
    return tokens, unique_words
import string

from nltk.stem.snowball import ItalianStemmer


def description_preproc(description):
    description = description.strip()
    description = description.replace("\n", " ")
    description = description.replace('\r', " ")
    description = description.replace('’', " ")
    sp = string.punctuation + '“”–’°•€'
    punctuation_remover = str.maketrans('', '', sp)
    description = description.split(' ')
    # remove punctuation
    description = [word.translate(punctuation_remover) for word in description]
    # remove empty strings from the list
    description = filter(None, description)
    # Italian stemmer
    stemmer = ItalianStemmer()
    # stemmed list
    stemmed_list = [stemmer.stem(word) for word in description]
    return ' '.join(stemmed_list)
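A usage sketch (stems are approximate):

print(description_preproc("Splendido attico, vista panoramica!"))
# e.g. 'splend attic vist panoram'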
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import FrenchStemmer, ItalianStemmer, SpanishStemmer
from rank_bm25 import BM25Okapi

# save_BM25_res and save_BM25_qrels_dataframe are project-local helpers


def run_BM25_collection(output_dir, documents, queries, qrels, train,
                        validation, test, k, language):
    if language == 'en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    elif language == 'fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    elif language == 'es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
    elif language == 'it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()

    corpus = []
    doc_indexes = []
    for key, value in documents.items():
        doc_indexes.append(key)
        doc = [stemmer.stem(elem) for elem in value.split(" ")
               if elem not in stop_words]
        # Index the stemmed tokens so they match the stemmed queries
        # (appending value.split(" ") here would silently discard the stemming)
        corpus.append(doc)
    bm25 = BM25Okapi(corpus)

    print("Running BM25", flush=True)
    results = dict()
    for i, elem in enumerate(train):
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
        if i % 1000 == 0:
            print('Processing query', i, '/', len(train), flush=True)
    save_BM25_res(output_dir + '/training/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/training/BM25.qrels.csv',
                              results, qrels, True)

    results = dict()
    for elem in validation:
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
    save_BM25_res(output_dir + '/validation/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/validation/BM25.qrels.csv',
                              results, qrels, False)

    results = dict()
    for elem in test:
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
    save_BM25_res(output_dir + '/test/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/test/BM25.qrels.csv',
                              results, qrels, False)
import codecs

import numpy as np
from nltk.stem.snowball import ItalianStemmer
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


class StemmedTfidf(TfidfVectorizer):

    def __init__(
        self,
        input="content",
        encoding="utf-8",
        decode_error="ignore",
        strip_accents=None,
        lowercase=True,
        stop_file="stopwords_it.txt",
        ngram_range=(1, 1),
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.int64,
        norm="l2",
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False,
    ):
        self.stemmer = ItalianStemmer()
        stopwords = [line.strip() for line in codecs.open(stop_file, "r", "utf-8")]
        super(StemmedTfidf, self).__init__(
            input=input,
            encoding=encoding,
            decode_error=decode_error,
            strip_accents=strip_accents,
            lowercase=lowercase,
            analyzer=self.stemmed_analyzer,
            stop_words=stopwords,
            ngram_range=ngram_range,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
            vocabulary=vocabulary,
            binary=binary,
            dtype=dtype,
            # forward the tf-idf options instead of silently dropping them
            norm=norm,
            use_idf=use_idf,
            smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf,
        )

    def stemmed_analyzer(self, document):
        tokens = wordpunct_tokenize(self.decode(document))
        stopwords = self.get_stop_words()
        stems = [self.stemmer.stem(token) for token in tokens
                 if token.isalpha() and token not in stopwords]
        return stems
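A usage sketch; stopwords_it.txt is assumed to exist with one stopword per line:

vect = StemmedTfidf(stop_file="stopwords_it.txt")
X = vect.fit_transform(["La casa è bella", "Il mare è blu"])
print(vect.get_feature_names_out())  # stemmed vocabulary
# (use get_feature_names() on older scikit-learn versions)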
class StemTokenizer(object):

    def __init__(self):
        self.stemmer = ItalianStemmer()

    def __call__(self, document):
        # stopwords and whitelist are module-level globals
        lemmas = []
        for t in word_tokenize(document, language='italian'):
            t = t.strip()  # leading whitespace is eliminated
            lemma = self.stemmer.stem(t)  # Stemmer
            # filter stopwords
            if t not in stopwords:  # and detect(t) == 'it'  # to detect language
                if (len(lemma) > 2) and (len(lemma) < 16):
                    lemmas.append(lemma)
            # allow words in the whitelist, even if they are stopwords
            elif t in whitelist:
                lemmas.append(lemma)
        return lemmas
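A usage sketch with a scikit-learn vectorizer, assuming the stopwords and whitelist globals the tokenizer references are defined in the same module:

from sklearn.feature_extraction.text import CountVectorizer

stopwords = {"le", "il", "sono"}  # stand-ins for the module-level globals
whitelist = set()
vect = CountVectorizer(tokenizer=StemTokenizer(), lowercase=True)
X = vect.fit_transform(["Le case sono bellissime", "Il giardino fiorisce"])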
def run_BM25_query(query, bm25, doc_indexes, k, language):
    if language == 'en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    elif language == 'fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    elif language == 'es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
    elif language == 'it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    tokenized_query = [stemmer.stem(elem) for elem in query.split(" ")
                       if elem not in stop_words]
    doc_scores = bm25.get_scores(tokenized_query)
    # indices of the k highest-scoring documents, best first
    top_k = np.argsort(doc_scores)[::-1][:k]
    results = [[doc_indexes[key], doc_scores[key]] for key in top_k]
    return results
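A minimal end-to-end sketch with the rank_bm25 package on toy Italian data:

from nltk.stem.snowball import ItalianStemmer
from rank_bm25 import BM25Okapi

docs = {"d1": "la casa al mare", "d2": "il giardino di casa"}
doc_indexes = list(docs.keys())
stemmer = ItalianStemmer()
corpus = [[stemmer.stem(w) for w in text.split(" ")] for text in docs.values()]
bm25 = BM25Okapi(corpus)
print(run_BM25_query("casa mare", bm25, doc_indexes, k=2, language='it'))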
import sys
import logging
import logging.config

logging.config.fileConfig('logging.conf')
logging.getLogger('chatbot')

import nltk
logging.info('nltk [imported]')

from nltk.stem.snowball import ItalianStemmer
stemmer = ItalianStemmer()
logging.info('ItalianStemmer [imported]')

# import our chat-bot intents file
import json
with open('assets/intents.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)
logging.info('intents [loaded]')

words = []
classes = []
documents = []
ignore_words = ['?', ',', '.']

# loop through each sentence in our intents patterns
logging.info('Loop on intents...')
for intent in intents['intents']:
    for pattern in intent['patterns']:
        logging.debug('Evaluate pattern: ' + pattern)
        # tokenize each word in the sentence
        w = nltk.word_tokenize(pattern)
        print('Tokenize words:', w)
import os
import re
import time
from collections import Counter, defaultdict, namedtuple
from os.path import isdir, isfile

import heapdict
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import ItalianStemmer
from nltk.tokenize import word_tokenize
from scipy import sparse
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from tqdm import tqdm
from wordcloud import WordCloud

# @timeit is assumed to be a project-local timing decorator


class AdCluster:

    def __init__(self):
        self.data_dir = "./data/"
        if not isdir(self.data_dir):
            os.mkdir(self.data_dir)
        self.info = None
        self.desc = None
        self.desc_index = None
        self.stemmer = ItalianStemmer()
        self.stop_words = set(stopwords.words('italian'))
        self.vocab = None
        self.documents = None
        self.inv_index = None  # no inverted index yet available
        self.idf = None  # no inverse document frequency yet available
        self.nltk_check_downloaded()
        self.url_base = "https://www.immobiliare.it"
        self.url_search = "/vendita-case/roma/?criterio=rilevanza&pag="
        try:
            html = requests.get(self.url_base + self.url_search + "1").content
            soup = BeautifulSoup(html, "html.parser")
            pag_number_list = soup.find("ul", class_="pagination pagination__number")
            self.max_pag_nr = int(pag_number_list.find_all("li")[-1].text)
        except requests.exceptions.ConnectionError:
            pass

    def load_data(self, info_fname, desc_fname, convert_to_tfidf=True,
                  skip_scrape=False):
        info_file = self.data_dir + info_fname
        desc_file = self.data_dir + desc_fname
        info_exists = isfile(info_file)
        desc_exists = isfile(desc_file)
        if info_exists and desc_exists:
            info, desc = pd.read_csv(info_file, sep=",", index_col=None, header=None), \
                         pd.read_csv(desc_file, sep=",", index_col=None, header=None)
        elif not skip_scrape:
            info, desc = self.scrape_immobiliare()
        else:
            raise ValueError(f"No files present and 'skip_scrape'={skip_scrape}.")
        info.drop(columns=[0], inplace=True)
        info.columns = ['ID', 'Price', 'Rooms', 'Area', 'Bathrooms', 'Floor']
        desc.drop(columns=[0, 2], inplace=True)
        desc.columns = ['ID', 'Description']
        info.reset_index(drop=True, inplace=True)
        desc.reset_index(drop=True, inplace=True)
        info[info["Floor"] == "A"] = 12
        info[info["Floor"].isin(("R", "T"))] = 0
        info[info["Floor"] == "S"] = -1
        # remove duplicates
        info = info.loc[(-1 * info["ID"].duplicated(keep=False) + 1).astype(bool)]
        desc = desc.loc[(-1 * desc["ID"].duplicated(keep=False) + 1).astype(bool)]
        desc_ids = desc["ID"]
        info_ids = info["ID"]
        info_corr = info_ids[info_ids.isin(desc_ids)]
        desc_corr = desc_ids[desc_ids.isin(info_ids)]
        rem_ids = pd.unique(pd.concat((info_corr, desc_corr)))
        info = info[info["ID"].isin(rem_ids)]
        desc = desc[desc["ID"].isin(rem_ids)]
        nans = lambda df: df.isnull().any(axis=1)  # handy func to find NaNs
        nan_info = nans(info)
        nan_desc = nans(desc)
        # drop all ads where any of the two matrices encounter NaNs
        info = info.drop(index=info.index[nan_info | nan_desc]).reset_index(drop=True)
        desc = desc.drop(index=desc.index[nan_info | nan_desc]).reset_index(drop=True)
        if convert_to_tfidf:
            desc = self.build_desc_matrix(desc)
        self.info = info
        self.desc = desc
        return info, desc

    @staticmethod
    def get_ad_from_url(url, parser):
        response = requests.get(url)
        html_soup = BeautifulSoup(response.text, parser)
        ad_containers = html_soup.find_all('p', class_='titolo text-primary')
        urls = []
        for container in ad_containers:
            if "/nuove_costruzioni/" not in container.a['href']:
                urls.append(container.a['href'])
        return urls

    @staticmethod
    def get_data(url):
        id = re.findall(r'(\d+)', url)[0]  # Get the ad ID by parsing the url
        response = requests.get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        data_container = html_soup.find('ul', class_='list-inline list-piped features__list')
        if data_container is not None:
            find = lambda itm: itm.find('div', class_='features__label')
            for item in data_container.children:
                found = find(item)
                if found:
                    # Locate rooms number
                    if found.contents[0] == 'locali':
                        rooms = item.find('span', class_='text-bold').contents[0]
                        rooms = re.sub('[^A-Za-z0-9]+', '', rooms)
                    # Locate surface extension
                    elif found.contents[0] == 'superficie':
                        area = item.find('span', class_='text-bold').contents[0]
                        area = re.sub('[^A-Za-z0-9]+', '', area)
                    # Locate bathrooms number
                    elif found.contents[0] == 'bagni':
                        bathrooms = item.find('span', class_='text-bold').contents[0]
                        bathrooms = re.sub('[^A-Za-z0-9]+', '', bathrooms)
                    # Locate floor number
                    elif found.contents[0] == 'piano':
                        floor = item.find('abbr', class_='text-bold').contents[0]
                        floor = re.sub('[^A-Za-z0-9]+', '', floor)
        # Extract the description
        try:
            cl = 'col-xs-12 description-text text-compressed'
            description = html_soup.find('div', class_=cl).div.contents[0]
            description = re.sub('[^a-zA-Z0-9-_*. ]', '', description)  # Remove special characters
            description = description.lstrip(' ')  # Remove leading blank spaces
        except AttributeError:
            return False
        try:
            return [[id, rooms, area, bathrooms, floor], [id, description]]
        except NameError:
            return False

    def scrape_immobiliare(self):
        row_info, row_desc, url_list = [], [], []
        try:
            import lxml
            parser = "lxml"
        except ImportError:
            parser = "html.parser"
        base_url = "https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="
        for i in tqdm(range(450)):
            url_list += self.get_ad_from_url(base_url + str(i), parser)
        for url in tqdm(url_list):
            print(url)
            # This while loop is needed to retry the request in case of
            # connection errors
            while True:
                try:
                    cont = self.get_data(url)
                    if cont:
                        # Convert list into dataframe
                        row_data = np.asarray(cont[0]).reshape(1, 5)
                        row_data = pd.DataFrame(data=row_data,
                                                columns=['ID', 'Rooms', 'Area', 'Bathrooms', 'Floor'])
                        # Append results to info dataframe
                        row_info.append(row_data)
                        # Convert list into dataframe (np.asarray instead of
                        # the removed pd.np alias)
                        row_description = np.asarray(cont[1]).reshape(1, 2)
                        row_description = pd.DataFrame(data=row_description,
                                                       columns=['ID', 'Description'])
                        # Append results to description dataframe
                        row_desc.append(row_description)
                        # Create the two csv files line by line
                        with open('data/data.csv', 'a') as f:
                            row_data.to_csv(f, header=False)
                        with open('data/description.csv', 'a') as f:
                            row_description.to_csv(f, header=False)
                # Wait a second in case of connection error and retry
                except ConnectionError:
                    print('Connection Error')
                    time.sleep(1)
                    continue
                break
        info = pd.concat(row_info)
        desc = pd.concat(row_desc)
        # remove duplicates
        info = info.loc[(-1 * info["ID"].duplicated(keep=False) + 1).astype(bool)]
        desc = desc.loc[(-1 * desc["ID"].duplicated(keep=False) + 1).astype(bool)]
        desc_ids = desc["ID"]
        info_ids = info["ID"]
        info_corr = info_ids[info_ids.isin(desc_ids)]
        desc_corr = desc_ids[desc_ids.isin(info_ids)]
        rem_ids = pd.unique(pd.concat((info_corr, desc_corr)))
        info = info[info["ID"].isin(rem_ids)]
        desc = desc[desc["ID"].isin(rem_ids)]
        nans = lambda df: df.isnull().any(axis=1)  # handy func to find NaNs
        nan_info = nans(info)
        nan_desc = nans(desc)
        # drop all ads where any of the two matrices encounter NaNs
        info = info.drop(index=info.index[nan_info | nan_desc])
        desc = desc.drop(index=desc.index[nan_info | nan_desc])
        return info, desc

    @timeit
    def build_desc_matrix(self, desc_df):
        self.desc_index = desc_df.index
        docs = desc_df["Description"]
        docs = self._process_docs(docs)
        self._build_invert_idx(docs, proc=False)
        # In the following, the one-hot encoding of the relevant documents is
        # computed and its tfidf values stored in a sparse matrix.
        col = []   # list of non-zero column indices
        row = []   # list of non-zero row indices
        data = []  # data of the non-zero entries
        for d_nr, content in docs.items():
            for term in content:
                col.append(self.vocab.loc[term, "term_id"])
                row.append(d_nr)
                # find the tfidf (the data) of the term in this document
                found = False
                for termset in self.inv_index[term]:
                    if termset.docID == d_nr:
                        data.append(termset.tfidf)
                        found = True
                        break  # value found, no other termset needs to be checked
                if not found:
                    raise ValueError(f"Term {term} in document {d_nr} not found.")
        shape = len(docs), len(self.vocab)
        desc_sparse = sparse.csr_matrix((data, (row, col)), shape=shape, dtype=float)
        return desc_sparse

    def desc_sparse_to_dense(self, desc_sparse):
        if isinstance(desc_sparse, sparse.csr_matrix):
            return pd.DataFrame(desc_sparse.toarray(),
                                index=self.desc_index,
                                columns=self.vocab.index)
        else:
            return desc_sparse

    @staticmethod
    def cluster_kmeans_elbow(X, normalize_=False):
        if normalize_:
            X_clust = normalize(X)
        else:
            X_clust = X
        i = 0
        ks, fits, scores = [], [], []
        while True:
            new_range = [k for k in range(10 * i + 1, 10 * i + 11)]
            ks += new_range
            KM = [KMeans(n_clusters=k) for k in new_range]
            f = [km.fit(X_clust) for km in KM]
            fits += f
            scores += [km.inertia_ for km in f]
            plt.plot(ks, scores)
            plt.show()
            print("Choose number of clusters: ", end="")
            new_k = input()
            if new_k != "":
                try:
                    new_k = int(new_k)
                    if new_k > 0:
                        break
                except ValueError:
                    pass
            i += 1
        km_fit = fits[new_k - 1]
        return km_fit

    def find_similar_clusters(self, clusters_info, clusters_desc):
        if self.info is None or self.desc is None:
            raise ValueError("Information and/or description dataframe not yet assigned.")
        labels_info = clusters_info.predict(self.info)
        labels_desc = clusters_desc.predict(self.desc)
        n_clusters_info = clusters_info.n_clusters
        n_clusters_desc = clusters_desc.n_clusters
        cluster_sim = heapdict.heapdict()
        for i in range(n_clusters_info):
            ind_info = np.where(labels_info == i)[0]
            for j in range(n_clusters_desc):
                ind_desc = np.where(labels_desc == j)[0]
                all_ind = np.concatenate((ind_info, ind_desc))
                intersec = 0
                if len(ind_info) < len(ind_desc):
                    for idx in ind_info:
                        if idx in ind_desc:
                            intersec += 1
                else:
                    for idx in ind_desc:
                        if idx in ind_info:
                            intersec += 1
                union = np.unique(all_ind)
                cluster_sim[i, j] = -intersec / len(union)
        return cluster_sim

    def top_words_clusters(self, data, labels, nr_top_k_words):
        top_data = data.apply(
            lambda x: pd.Series(x.sort_values(ascending=False).iloc[:nr_top_k_words].index,
                                index=[f"top{i}" for i in range(1, nr_top_k_words + 1)]),
            axis=1
        )
        _, desc_df = self.load_data("data.csv", "description.csv", convert_to_tfidf=False)
        desc_df = self._process_docs(desc_df["Description"], stem=False)
        top_data["cluster"] = labels
        top_data.sort_values(by=["cluster"], inplace=True)
        for cluster in pd.unique(top_data["cluster"]):
            d = top_data[top_data["cluster"] == cluster].drop(columns=["cluster"])
            freqs = dict()
            for x in d.itertuples():
                idx = x.Index
                actual_ad = desc_df[idx]
                words = x[1:]
                for word in words:
                    for act_w in actual_ad:
                        if self.stemmer.stem(act_w) == word:
                            actual_word = act_w
                            break
                    freqs[actual_word] = data.loc[idx, word]
            wordcloud = WordCloud(width=1600, height=800, background_color="white")
            wordcloud.generate_from_frequencies(freqs)
            plt.figure(num=None, figsize=(20, 10), facecolor='w', edgecolor='k')
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            plt.title(f"Cluster {cluster} word-cloud of top {nr_top_k_words} words of each ad within cluster.\n"
                      f"The size of words corresponds to their TFIDF value.")
            plt.show()
        return top_data

    @timeit
    def _create_vocab(self, docs, proc=True):
        """
        Creates the vocabulary from documents or reads the vocabulary from file.
        The name is always "vocabulary.csv", containing the word as index and
        its term id as column entry.
        :param docs: dict or pd.DataFrame, the collection of documents (only essential parts)
        :return: the vocabulary
        """
        fname = f"{self.data_dir}vocabulary.csv"
        if proc:
            docs = self._process_docs(docs)
        self.vocab = set()
        for doc in docs.values():
            self.vocab.update(doc)
        self.vocab = pd.DataFrame(pd.Series(np.arange(len(self.vocab)), index=self.vocab),
                                  columns=["term_id"])
        self.vocab.to_csv(fname)
        return self.vocab

    def _process_text(self, text, stem=True):
        """
        Remove special characters and superfluous whitespace from the text body.
        Also lowercase the text, tokenize it and stem the terms.
        :param text: str, the text to process.
        :return: generator, yields the processed words in iteration
        """
        if stem:
            stem_func = self.stemmer.stem
        else:
            stem_func = lambda x: x
        text = self.doc_to_string(text).lower()
        sub_re = r"[^A-Za-z']"
        text = re.sub(sub_re, " ", text)
        for i in word_tokenize(text):
            if i not in self.stop_words:
                w = stem_func(i)
                if len(w) > 1:
                    yield w

    def _process_docs(self, docs=None, stem=True):
        """
        Takes a collection of documents and processes them iteratively.
        The docs can be a pd.DataFrame, pd.Series or dictionary.
        :param docs: pd.DataFrame, pd.Series or dictionary
        :return: dict, indexed by doc number, with lists (processed docs) as values
        """
        if isinstance(docs, pd.DataFrame):
            docs_generator = docs.iterrows()
        elif isinstance(docs, pd.Series):
            docs_generator = docs.iteritems()
        elif isinstance(docs, dict):
            docs_generator = docs.items()
        else:
            raise ValueError("Container type has no handler.")
        d_out = dict()
        for docnr, doc in docs_generator:
            d_out[docnr] = list(self._process_text(doc, stem=stem))
        return d_out

    @staticmethod
    def nltk_check_downloaded():
        """
        Check the prerequisite NLTK tools, download them if not found.
        """
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    @staticmethod
    def doc_to_string(doc):
        """
        Converts a document to a string. Can take a list, DataFrame or tuple.
        :param doc: iterable, container of the document
        :return: str, the document converted to a string.
        """
        if isinstance(doc, str):
            return doc
        elif isinstance(doc, np.ndarray):
            doc = " ".join(list(map(str, doc.flatten())))
        elif isinstance(doc, (list, tuple)):
            doc = " ".join(doc)
        elif isinstance(doc, (pd.DataFrame, pd.Series)):
            doc = " ".join(list(map(str, doc.values.flatten())))
        else:
            raise ValueError(f"Can't convert file type {type(doc)} to string.")
        return doc

    @timeit
    def _build_invert_idx(self, docs=None, proc=False, read_fname="inverted_index.txt",
                          write_fname="inverted_index.txt", load_from_file=False):
        """
        Build the inverted index for the terms in a collection of documents.
        Will load a previously built inverted index from file if it detects
        the file existing (and param load_from_file is True).
        :param docs: pd.DataFrame/dict, collection of documents
        :param read_fname: str, filename of the inverted txt to load. Needs to
                           be built in the specified way of the method
        :param write_fname: str, filename to write the inverted index to.
        :param load_from_file: bool, load the index from the filename provided if True
        :return: dict, the inverted index with terms as keys and
                 [TermSet(docID, tfidf), ...] as values.
""" if self.vocab is None: self._create_vocab(docs, proc=proc) file = f"{self.data_dir}{read_fname}" TermSet = namedtuple("TermSet", "docID tfidf") if isfile(file) and load_from_file: idf_dict = dict() inv_index = dict() with open(file, "r") as f: # load all the information from the file into memory for rowidx, line in enumerate(f): if rowidx > 0: term, idf_doclist = line.strip().split(":", 1) idf, doclist = idf_doclist.split("|", 1) idf_dict[term] = idf doclist = list(map(lambda x: re.search(r"\d+,\s?(\d[.])?\d+", x).group().split(","), doclist.split(";"))) inv_index[term] = [TermSet(*list(map(float, docl))) for docl in doclist] else: # the final inverted index container, defaultdict, so that new terms # can be searched and get an empty list back inv_index = defaultdict(list) docs, idf_dict, term_freqs, doc_counters = self._build_idf(docs, proc) for docnr, doc in docs.items(): # weird, frequency pairs for this document freqs = doc_counters[docnr] for word, word_freq in freqs.items(): # nr of words in this document n_terms = sum(freqs.values()) # store which document and frequency inv_index[word].append(TermSet(docnr, word_freq / n_terms * idf_dict[word])) # write the built index to file with open(f"{self.data_dir}{write_fname}", "w") as f: f.write("Word: [Documents list]\n") for word, docs in inv_index.items(): docs = [(doc.docID, doc.tfidf) for doc in docs] f.write(f"{word}: {idf_dict[word]} | {';'.join([str(doc) for doc in docs])}\n") self.inv_index = inv_index self.idf = idf_dict return inv_index @timeit def _build_idf(self, docs, proc=True): """ Builds the IDF values for terms in docs. :param docs: dict/pd.DataFrame, the documents :return: tuple; a tuple of (docs_dict, idf_dict, termFrequencies_dict, docCounters_dict). The idf_dict contains the IDF value for each term in the documents. The termFrequencies_dict contains the global number of occurences of each term in all the docs. the docCounters_dict contains the local number of occurences of each term in the respective doc. """ if proc: docs = self._process_docs(docs) nr_docs = len(docs) idf = defaultdict(lambda: np.math.log(len(docs) + 1)) # dict to track nr of occurences of each term term_freqs = dict() # dict to store counter of words in each doc doc_counters = dict() for docnr, doc in docs.items(): freqs = Counter(doc) doc_counters[docnr] = freqs for word in freqs.keys(): if word in term_freqs: term_freqs[word] += 1 else: term_freqs[word] = 1 for word in self.vocab.index: # nr of documents with this term in it nr_d_with_term = term_freqs[word] # inverse document frequency for this term and this document idf[word] = np.math.log((float(nr_docs + 1) / (1 + nr_d_with_term))) self.idf = idf return docs, idf, term_freqs, doc_counters
def stem_words(self, words):
    stemmer = ItalianStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words
class MyTokenizer:

    # Constructor
    def __init__(self, no_numbers=True, min_length=1, clean_emoji=True,
                 stop_words_bool=True, whitelist_stop_words=True, stemmer=False):
        self.no_numbers = no_numbers
        self.min_length = min_length
        self.stemmer = stemmer
        self.ita_stemmer = ItalianStemmer()
        self.replace = [
            '#', '>', '_', '<', '-', '|', '\\', '/', '^', '\n', '”', '“',
            '"', '’', '‘', '€', '´', '.', '…'
        ]
        self.emoji = None
        self.clean_emoji = clean_emoji
        if self.clean_emoji:
            try:
                # UCS-4
                self.emoji = re.compile(u'[\U00010000-\U0010ffff]')
            except re.error:
                # UCS-2
                self.emoji = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        self.pos_emoji = [
            '👍', '😀', '💪', '😎', '👌', '😁', '😃', '😄', '😊', '😋',
            '😍', '😻', '🤗', '👏🏻', '😘', '🎉', '💗', '🔝', '😉'
        ]
        self.neg_emoji = [
            '👎', '😒', '😖', '😠', '😡', '😤', '😨', '😱', '😳', '😬',
            '😞', '🤐', '😕', '😢'
        ]
        self.stop_words_bool = stop_words_bool
        if self.stop_words_bool:
            sw = StopWords(whitelist=whitelist_stop_words)
            sw_list = sw.getStopWords()
            stop_words_dict = defaultdict(lambda: -1)
            for i, word in enumerate(sw_list):
                stop_words_dict[word] = 1
            self.stop_words_dict = stop_words_dict
        return

    def __call__(self, doc):
        doc = re.sub(r'[hHtTpP]+[sS]?:[A-Za-z0-9-#_./]+', ' ', doc)
        if self.no_numbers:
            doc = re.sub(r'\d+', ' ', doc)
        for punct in string.punctuation:
            doc = doc.replace(punct, " ")
        for specialChar in self.replace:
            doc = doc.replace(specialChar, ' ')
        if self.clean_emoji:
            for specialEmoji in self.pos_emoji:
                doc = doc.replace(specialEmoji, ' positiveemoji ')
            for specialEmoji in self.neg_emoji:
                doc = doc.replace(specialEmoji, ' negativeemoji ')
            doc = self.emoji.sub(u' ', doc)
        split_doc = [
            token.lower().strip() for token in doc.split(" ") if token
        ]
        if self.stop_words_bool:
            split_doc = [
                word for word in split_doc
                if len(word) > self.min_length and len(word) < 16
                and self.stop_words_dict[word] != 1
            ]
        else:
            split_doc = [
                word for word in split_doc
                if len(word) > self.min_length and len(word) < 16
            ]
        if self.stemmer:
            split_doc = [self.ita_stemmer.stem(word) for word in split_doc]
        return split_doc
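MyTokenizer depends on a project-local StopWords class; assuming that is importable, the callable can be plugged straight into a scikit-learn vectorizer:

from sklearn.feature_extraction.text import TfidfVectorizer

tokenizer = MyTokenizer(stemmer=True)
vect = TfidfVectorizer(tokenizer=tokenizer, lowercase=False)
X = vect.fit_transform(["Che bella giornata 😀",
                        "Pessimo servizio 👎 http://example.com"])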
import pickle
import json
import tflearn
import tensorflow as tf
import numpy as np
import re
import random
import nltk
from nltk.stem.snowball import ItalianStemmer

stemmer = ItalianStemmer()

data = pickle.load(open("training_data", "rb"))
words = data['words']
classes = data['classes']
train_x = data['train_x']
train_y = data['train_y']

# import our chat-bot intents file
with open('intents.json') as json_data:
    intents = json.load(json_data)


def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(re.sub(r'[^\w\s]', ' ', sentence),
                                        language='italian')
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words