from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
import time
import spacy

import config  # project configuration module (import path assumed)
from preprocess.text_pipeline import TextPipeline
from utils.metrics import metric


class TextVectorizer:

    def __init__(self, model_type, part_type, ngram=3):
        ngram_range = (ngram, ngram)
        self.part_type = part_type
        self.model_type = model_type
        if model_type == 'count':
            self.vectorizer = CountVectorizer(ngram_range=ngram_range)
        elif model_type == 'tfidf':
            self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        else:
            print('model is not valid')
            exit(1)
        self.knn = None
        self.nlp = spacy.load('en_core_web_sm')
        self.normalizer = TextPipeline(self.nlp)

        path_data = config.path_data + self.part_type + '_3'
        with open(path_data, 'rb') as handle:
            data = pickle.load(handle)
        # the vectorizer takes a string as input, not a list of tokens
        # self.data_train = [item['normalized'] for item in data]
        self.labels = [item['tag'] for item in data]

        self.path_knn = config.path_knn.format(model_type, part_type, ngram)
        self.path_vec = config.path_model.format(model_type, part_type, ngram)
        self.entryname = "{}_{}_{}".format(model_type.upper(), part_type, ngram)
        print(self.path_knn)
        print(self.path_vec)

    def save(self):
        with open(self.path_knn, 'wb') as handle:
            pickle.dump(self.knn, handle)
        with open(self.path_vec, 'wb') as handle:
            pickle.dump(self.vectorizer, handle)

    def load(self):
        with open(self.path_knn, 'rb') as handle:
            self.knn = pickle.load(handle)
        with open(self.path_vec, 'rb') as handle:
            self.vectorizer = pickle.load(handle)
        self.l_sign = len(self.vectorizer.get_feature_names())
        print(self.vectorizer)

    def train(self):
        path_data = config.path_data + self.part_type + '_3'
        with open(path_data, 'rb') as handle:
            data = pickle.load(handle)
        # the vectorizer takes a string as input, not a list of tokens
        self.data_train = [item['normalized'] for item in data]

        print("==== TRAINING [ model = {}, part_type = {} ]".format(self.model_type, self.part_type))
        matrix = self.vectorizer.fit_transform(self.data_train)
        self.l_sign = len(self.vectorizer.get_feature_names())
        self.knn = NearestNeighbors(n_neighbors=config.num_recommendations,
                                    metric='cosine').fit(matrix)
        self.save()
        print("==== END ====================")

    def predict(self, query, threshold=config.default_threshold,
                N=config.num_recommendations, Trigram=False):
        start_time = time.time()
        query_norm = self.normalizer.convert(query, divNGram=False)
        query_vec = self.vectorizer.transform([query_norm])

        dist, idx = self.knn.kneighbors(query_vec)
        idx = idx[0]
        timing_search = "%.2f ms" % ((time.time() - start_time) * 1000)

        res_json = []
        for id in idx:
            item = metric(query_norm, self.labels[id], self.normalizer, Trigram=Trigram)
            if float(item['lev']) >= threshold:
                res_json += [item]

        # ====== RE-RANKING =========================================================
        res_json = sorted(res_json, key=lambda i: i['lev'], reverse=True)
        timing = "%.2f ms" % ((time.time() - start_time) * 1000)

        return {'query': query, 'data': res_json, 'time': timing, 'max': N,
                'time_search': timing_search, 'threshold': threshold,
                'algoritm': self.entryname}
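# A minimal usage sketch for TextVectorizer (not part of the original sources).
# The 'tfidf'/'phrase' combination and the query string are illustrative; paths,
# thresholds and recommendation counts come from config as configured above.
if __name__ == '__main__':
    model = TextVectorizer('tfidf', 'phrase', ngram=3)
    model.train()    # fits the vectorizer + cosine kNN index and pickles both
    model.load()     # reloads the pickled models from path_knn / path_vec
    result = model.predict("the respondent shall file an answer")  # hypothetical query
    for hit in result['data']:
        print(hit['lev'], hit)   # hits are already re-ranked by Levenshtein score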
# s1, query, SGN_L (the signature length in bits) and index_lsh are assumed to be
# defined earlier in this excerpted script; a sketch of how index_lsh can be built
# follows this excerpt.
start_time = time.time()
s1_hash = Simhash(s1, f=SGN_L)
results = index_lsh.get_near_dups(s1_hash)
timing_search = "%.2f ms" % ((time.time() - start_time) * 1000)

if len(results) == 0:
    res_json = []
else:
    print('\n', timing_search)
    print('#Res: ', len(results))

    import spacy
    from preprocess.text_pipeline import TextPipeline

    nlp = spacy.load('en_core_web_sm')
    normalizer = TextPipeline(nlp)
    query_norm = normalizer.convert(query, divNGram=False)

    res_json = []
    for doc_retrival in results:
        item = metric(query_norm, doc_retrival, normalizer, Trigram=False)
        if float(item['lev']) >= 0.3:
            res_json += [item]

    # ====== RE-RANKING =========================================================
    res_json = sorted(res_json, key=lambda i: i['lev'], reverse=True)[:10]

    # search time + re-ranking
    timing = "%.2f ms" % ((time.time() - start_time) * 1000)

    print(json.dumps({
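# A minimal sketch (not in the original sources) of how the index_lsh used above
# could be built with the `simhash` package. SGN_L and the `documents` corpus are
# assumptions for illustration; in the project they come from the surrounding script.
from simhash import Simhash, SimhashIndex

SGN_L = 64                                       # assumed signature length in bits
documents = {'doc_1': 'first normalized text',   # hypothetical corpus: id -> text
             'doc_2': 'second normalized text'}

objs = [(doc_id, Simhash(text, f=SGN_L)) for doc_id, text in documents.items()]
index_lsh = SimhashIndex(objs, f=SGN_L, k=3)     # k = max bit difference for near-duplicates

# querying then works as in the excerpt above:
print(index_lsh.get_near_dups(Simhash('first normalized text', f=SGN_L)))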
import pickle
import time

import numpy as np
import spacy
from datasketch import MinHash, MinHashLSHForest
from tqdm import tqdm

import config  # project configuration module (import path assumed)
from preprocess.text_pipeline import TextPipeline
from utils import metrics

# cleanhtml is the project's HTML-stripping helper, defined elsewhere in the codebase.


class Minhash():

    def __init__(self, type, k='3'):
        nlp = spacy.load('en_core_web_' + config.size_nlp)
        self.k = k
        self.entryname = 'Minhash_K_' + self.k
        config.kGRAM = self.k
        self.normalizer = TextPipeline(nlp)
        self.permutation = config.permutations
        self.type = type
        self.model = None
        self.path_model = ""
        if self.type == 'trigram':
            self.path_model = config.path_models + "_" + self.type
            self.k = '3'
        else:
            self.path_model = config.path_models + "_" + self.type + "_" + self.k
        self.pathDataProc = config.pathDataProc.format(self.type, self.k)

    def __save(self, obj, path="model"):
        with open(self.path_model, 'wb') as f:
            pickle.dump(obj, f)
        print("Saved: {}".format(path))

    def load(self):
        with open(self.path_model, "rb") as f:
            self.model = pickle.load(f)

    def __train_LSH(self, data):
        start_time = time.time()
        forest = MinHashLSHForest(num_perm=config.permutations)
        for item in tqdm(data, desc="MinHash Docs.."):
            tag = item['tag']
            tokens = item['data']
            if self.type == 'trigram':
                tokens = self.normalizer.generate_ngrams_char(tokens[0])
            m = MinHash(num_perm=config.permutations)
            for s in tokens:
                m.update(s.encode('utf8'))
            forest.add(tag, m)
        forest.index()
        print('It took %.2f seconds to build forest.' % (time.time() - start_time))
        return forest

    def train(self):
        part = self.type
        print("====== TRAINING {} [ K = {} ] ...".format(part, config.kGRAM))
        with open(self.pathDataProc, 'rb') as handle:
            data = pickle.load(handle)
        m_minhash = self.__train_LSH(data)
        self.__save(m_minhash, self.path_model)
        print("Model SAVED ~ {}".format(self.path_model))
        print("================================")

    def predict(self, query, threshold=config.default_threshold,
                N=config.num_recommendations, Trigram=False):
        if self.model is None:
            raise Exception("Model is not loaded!")

        query = cleanhtml(query)
        if self.type != 'trigram':
            Trigram = False
            query_norm = self.normalizer.convert(query, False)
            tokens = self.normalizer.convert(query)
        else:
            query, query_norm = self.normalizer.get_last_trigram(query)
            if query_norm is None:
                return {'query': query, 'data': [], 'time': '0 ms', 'max': N,
                        'time_search': '0 ms', 'threshold': threshold}
            Trigram = True
            tokens = self.normalizer.generate_ngrams_char(query_norm)

        start_time = time.time()
        m = MinHash(num_perm=self.permutation)
        for s in tokens:
            m.update(s.encode('utf8'))

        # m is the query as a MinHash signature and N is the maximum number of items requested
        idx_array = np.array(self.model.query(m, N))
        timing_search = "%.2f ms" % ((time.time() - start_time) * 1000)

        res_json = []
        for doc_retrival in idx_array:
            item = metrics.metric(query_norm, doc_retrival, self.normalizer, Trigram=Trigram)
            if float(item['lev']) >= threshold:
                res_json += [item]

        # ====== RE-RANKING =========================================================
        res_json = sorted(res_json, key=lambda i: i['lev'], reverse=True)
        timing = "%.2f ms" % ((time.time() - start_time) * 1000)

        return {'query': query, 'data': res_json, 'time': timing, 'max': N,
                'time_search': timing_search, 'threshold': threshold,
                'algoritm': self.entryname}
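# A minimal usage sketch for the Minhash model (not part of the original sources);
# the 'phrase' part type, k value and query string are illustrative.
if __name__ == '__main__':
    mh = Minhash('phrase', k='3')
    mh.train()     # builds the MinHash LSH forest and pickles it to path_model
    mh.load()      # reloads the pickled forest into self.model
    res = mh.predict("the parties agree to arbitrate")  # hypothetical query
    print(res['time_search'], res['time'])
    for hit in res['data']:
        print(hit['lev'], hit)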
import pickle

import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

from preprocess.text_pipeline import TextPipeline
from utils.metrics import metric

nlp = spacy.load('en_core_web_sm')
normalizer = TextPipeline(nlp)

k = '1'
t = 'phrase'
path_data = '/home/anto/Scrivania/Tesi/testing/processed_data/' + t + '_' + k
with open(path_data, 'rb') as handle:
    data = pickle.load(handle)

X = [" ".join(item['data']) for item in data]
Y = [item['tag'] for item in data]

# debug: inspect a slice of the processed data, then stop
for i in range(550, 2000):
    print(">", Y[i])
    print(">", X[i])
    print("\n ==== \n")
exit()

vectorizer = TfidfVectorizer(ngram_range=(3, 3))
matrix = vectorizer.fit_transform(X)
knn = NearestNeighbors(n_neighbors=5, metric='cosine').fit(matrix)


def predict(id):
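# A hedged sketch (not from the original script) of how the fitted vectorizer and
# knn index above can be queried once the debug loop and exit() are removed; the
# helper name query_knn and the example query are illustrative.
def query_knn(text, n=5):
    vec = vectorizer.transform([normalizer.convert(text, divNGram=False)])
    dist, idx = knn.kneighbors(vec, n_neighbors=n)
    # return the tags of the nearest items together with their cosine distances
    return [(Y[i], d) for i, d in zip(idx[0], dist[0])]

# print(query_knn("the respondent shall file an answer"))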
import json
import uuid

import spacy
from tqdm import tqdm

import config  # project configuration module (import path assumed)
from preprocess.text_pipeline import TextPipeline


class Processer():

    def __init__(self, filepath="", part=""):
        self.filepath = filepath + "total_" + part + ".json"
        self.nlp = spacy.load('en_core_web_' + config.size_nlp)
        self.normalizer = TextPipeline(self.nlp)

        if part == "paragraph" or part == "section" or part == "trigram":
            if part == "trigram":
                # the sentences are used for trigram extraction
                self.filepath = filepath + "total_phrase.json"
            self.tag = part[0].upper()
        else:
            self.tag = "F"

        print(self.filepath)
        with open(self.filepath) as json_file:
            self.data = json.load(json_file)

        if self.tag == "T":
            print("TOTAL {}: {} (NUMBER OF SENTENCES)".format(part, self.data['total']))
        else:
            print("TOTAL {}: {}".format(part, self.data['total']))

    def __iter__(self):
        if config.DEBUG:
            docList = list(self.data['data'].keys())[:config.item_on_debug]
        else:
            docList = list(self.data['data'].keys())

        progress_doc = 0
        for docname in docList:
            print("Doc {}: {}/{}".format(docname, progress_doc, len(docList)))
            progress_doc = progress_doc + 1
            items_of_doc = self.data['data'][docname]
            for (i, item) in enumerate(items_of_doc):
                data_list_normalized = self.normalizer.convert_trigram(item)
                for key in data_list_normalized.keys():
                    yield [{
                        'tag': '[' + docname + '#' + self.tag + "_" + str(uuid.uuid4()) + ']' + key,
                        'data': data_list_normalized[key]
                    }]

    def run(self):
        if config.DEBUG:
            docList = list(self.data['data'].keys())[:config.item_on_debug]
        else:
            docList = list(self.data['data'].keys())

        result = []
        for docname in tqdm(docList):
            items_of_doc = self.data['data'][docname]
            for (i, item) in enumerate(items_of_doc):
                data_list_normalized = self.normalizer.convert(item)
                if len(data_list_normalized) > 0:
                    result += [{
                        'tag': '[' + docname + '#' + self.tag + "_" + str(uuid.uuid4()) + ']' + item,
                        'data': data_list_normalized
                    }]
        return result
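# A minimal usage sketch for Processer (not part of the original sources); the
# data directory, the 'paragraph' part and the output path are illustrative, and
# the processed items would normally be pickled for the models above to consume.
if __name__ == '__main__':
    import pickle

    proc = Processer(filepath='data/', part='paragraph')   # reads data/total_paragraph.json
    processed = proc.run()                                 # [{'tag': ..., 'data': [...]}, ...]
    with open('processed_paragraph.pkl', 'wb') as handle:  # hypothetical output path
        pickle.dump(processed, handle)
    print(len(processed), "items processed")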