Example #4
class TextVectorizer:
    def __init__(self, model_type,part_type,ngram = 3):
        ngram_range = (ngram,ngram)
        self.part_type = part_type
        self.model_type = model_type
        if model_type == 'count':
            self.vectorizer = CountVectorizer(ngram_range=ngram_range)
        elif model_type == 'tfidf':
            self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        else:
            raise ValueError("model_type must be 'count' or 'tfidf'")

        self.knn = None
        self.nlp = spacy.load('en_core_web_sm')
        self.normalizer = TextPipeline(self.nlp)

        path_data = config.path_data + self.part_type + '_3'
        with open(path_data, 'rb') as handle:
            data = pickle.load(handle)

        # the vectorizer takes a whole string as input, not a list of tokens
        #self.data_train = [ item['normalized'] for item in data]
        self.labels = [item['tag'] for item in data]

        self.path_knn = config.path_knn.format(model_type,part_type,ngram)
        self.path_vec = config.path_model.format(model_type,part_type,ngram)
        self.entryname = "{}_{}_{}".format(model_type.upper(), part_type, ngram)
        print(self.path_knn)
        print(self.path_vec)

    def save(self):
        with open(self.path_knn, 'wb') as handle:
            pickle.dump(self.knn, handle)

        with open(self.path_vec, 'wb') as handle:
            pickle.dump(self.vectorizer, handle)



    def load(self):
        with open(self.path_knn, 'rb') as handle:
            self.knn = pickle.load(handle)

        with open(self.path_vec, 'rb') as handle:
            self.vectorizer = pickle.load(handle)

        self.l_sign = len(self.vectorizer.get_feature_names())
        print(self.vectorizer)

    def train(self):
        path_data = config.path_data + self.part_type + '_3'
        with open(path_data, 'rb') as handle:
            data = pickle.load(handle)

        # the vectorizer takes a whole string as input, not a list of tokens
        self.data_train = [item['normalized'] for item in data]

        print("==== TRAINING [ model = {}, part_type = {} ]".format(self.model_type,self.part_type))
        matrix = self.vectorizer.fit_transform(self.data_train)
        self.l_sign = len(self.vectorizer.get_feature_names())
        self.knn = NearestNeighbors(n_neighbors=config.num_recommendations, metric='cosine').fit(matrix)
        self.save()
        print("==== END ====================")


    def predict(self,
                query,
                threshold=config.default_threshold,
                N=config.num_recommendations,
                Trigram = False):

        start_time = time.time()

        query_norm = self.normalizer.convert(query, divNGram=False)
        query_vec = self.vectorizer.transform([query_norm])

        dist, idx = self.knn.kneighbors(query_vec)
        idx = idx[0]

        timing_search = "%.2f ms" % ((time.time() - start_time) * 1000)

        if len(idx) == 0:
            res_json = []
        else:
            res_json = []
            for doc_id in idx:
                item = metric(query_norm, self.labels[doc_id], self.normalizer, Trigram=Trigram)
                if float(item['lev']) >= threshold:
                    res_json += [item]
            # ====== RE-RANKING =========================================================
            res_json = sorted(res_json, key=lambda i: i['lev'], reverse=True)

        timing = "%.2f ms" % ((time.time() - start_time) * 1000)

        return {'query': query, 'data': res_json, 'time': timing, 'max': N, 'time_search': timing_search,
                'threshold': threshold, 'algorithm': self.entryname}
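
A minimal usage sketch of TextVectorizer, assuming the project's config paths point at existing pickled data; the part_type, query string, threshold and N values below are illustrative:

# Hypothetical usage of TextVectorizer (paths come from the project's config module).
tv = TextVectorizer(model_type='tfidf', part_type='phrase', ngram=3)
tv.train()      # fits the vectorizer + KNN index and pickles both
tv.load()       # or reload a previously trained model
res = tv.predict("machine learning for text similarity", threshold=0.3, N=10)
for hit in res['data']:
    print(hit['lev'])   # 'lev' is the score metric() returns and re-ranking sorts by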
Example #5
    start_time = time.time()
    s1_hash = Simhash(s1, f=SGN_L)
    results = index_lsh.get_near_dups(s1_hash)
    timing_search = "%.2f ms" % ((time.time() - start_time) * 1000)

    if len(results) == 0:
        res_json = []

    else:
        print('\n', timing_search)
        print('#Res: ', len(results))
        import spacy
        from preprocess.text_pipeline import TextPipeline
        nlp = spacy.load('en_core_web_sm')
        normalizer = TextPipeline(nlp)
        query_norm = normalizer.convert(query, divNGram=False)
        res_json = []
        for doc_retrival in results:
            item = metric(query_norm, doc_retrival, normalizer, Trigram=False)
            if float(item['lev']) >= 0.3:
                res_json += [item]
        # ====== RE-RANKING =========================================================
        res_json = sorted(res_json, key=lambda i: i['lev'], reverse=True)[:10]

    # search time + re-ranking
    timing = "%.2f ms" % ((time.time() - start_time) * 1000)

    print(
        json.dumps(
            {
                # keys mirror the result dicts the predict() methods in these snippets return
                'query': query,
                'data': res_json,
                'time': timing,
                'time_search': timing_search
            },
            indent=2))
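
The fragment above queries a prebuilt index_lsh; a minimal sketch of how such an index could be built with the simhash package (the SGN_L value and the docs corpus below are assumptions; in the project the keys would be the item tags):

from simhash import Simhash, SimhashIndex

SGN_L = 64                                    # assumed fingerprint length
docs = {'doc1': 'first example sentence',     # illustrative corpus: id -> text
        'doc2': 'second example sentence'}
objs = [(tag, Simhash(text, f=SGN_L)) for tag, text in docs.items()]
index_lsh = SimhashIndex(objs, f=SGN_L, k=3)  # k = max tolerated Hamming distance
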
class Minhash():
    def __init__(self,type, k = '3'):
        nlp = spacy.load('en_core_web_'+config.size_nlp)
        self.k = k
        self.entryname = 'Minhash_K_'+self.k
        config.kGRAM = self.k
        self.normalizer = TextPipeline(nlp)
        self.permutation = config.permutations
        self.type = type
        self.model = None
        self.path_model = ""
        if self.type == 'trigram':
            self.path_model = config.path_models + "_" + self.type
            self.k = '3'
        else:
            self.path_model = config.path_models + "_" + self.type + "_" + self.k

        self.pathDataProc = config.pathDataProc.format(self.type,self.k)


    def __save(self, obj, path="model"):
        with open(self.path_model, 'wb') as f:
            pickle.dump(obj, f)
        print("Saved: {}".format(path))

    def load(self):
        with open(self.path_model, "rb") as f:
            self.model = pickle.load(f)


    def __train_LSH(self,data):
        start_time = time.time()
        forest = MinHashLSHForest(num_perm=config.permutations)
        for item in tqdm(data, desc="MinHash Docs.."):
            tag = item['tag']
            tokens = item['data']

            if self.type == 'trigram':
                tokens = self.normalizer.generate_ngrams_char(tokens[0])
            m = MinHash(num_perm=config.permutations)
            for s in tokens:
                m.update(s.encode('utf8'))
            forest.add(tag,m)

        forest.index()
        print('It took %.2f seconds to build forest.' % (time.time() - start_time))
        return forest


    def train(self):
        part = self.type
        print("====== TRAINING {} [ K = {} ] ...".format(part,config.kGRAM))
        with open(self.pathDataProc, 'rb') as handle:
            data = pickle.load(handle)
            m_minhash = self.__train_LSH(data)

        self.__save(m_minhash,self.path_model)
        print("Model SAVED ~ {}".format(self.path_model))
        print("================================")


    def predict(self,
                query,
                threshold=config.default_threshold,
                N=config.num_recommendations,
                Trigram = False):
        if self.model is None:
            raise Exception("Model is not loaded!")

        query = cleanhtml(query)

        if self.type != 'trigram':
            Trigram = False
            query_norm = self.normalizer.convert(query,False)
            tokens = self.normalizer.convert(query)
        else:
            query, query_norm = self.normalizer.get_last_trigram(query)
            if query_norm is None:
                return {'query': query,
                        'data': [],
                        'time': '0 ms',
                        'max': N,
                        'time_search': '0 ms',
                        'threshold': threshold}
            else:
                Trigram = True
                tokens = self.normalizer.generate_ngrams_char(query_norm)

        start_time = time.time()
        m = MinHash(num_perm=self.permutation)
        for s in tokens:
            m.update(s.encode('utf8'))

        # m is the MinHash signature of the query; N is the maximum number of results requested
        idx_array = np.array(self.model.query(m, N))

        timing_search = "%.2f ms" % ((time.time() - start_time) * 1000)

        if len(idx_array) == 0:
            res_json = []
        else:

            res_json = []
            for doc_retrival in idx_array:
                item = metrics.metric(query_norm, doc_retrival, self.normalizer,Trigram=Trigram)
                if float(item['lev']) >= threshold:
                    res_json += [item]
            # ====== RE-RANKING =========================================================
            res_json = sorted(res_json, key=lambda i: i['lev'], reverse=True)

        timing = "%.2f ms" % ((time.time() - start_time) * 1000)
        return {'query': query, 'data': res_json, 'time': timing, 'max': N,
                'time_search': timing_search, 'threshold': threshold,
                'algorithm': self.entryname}
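
A minimal usage sketch of the Minhash class above, assuming the pickled data at config.pathDataProc exists; the type/k values and the query string are illustrative:

# Hypothetical usage of Minhash: train() pickles the LSH forest, load() reads it back.
mh = Minhash(type='phrase', k='3')
mh.train()
mh.load()
res = mh.predict("neural models for document retrieval", threshold=0.3, N=10)
for hit in res['data']:
    print(hit['lev'])
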
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
import spacy
from preprocess.text_pipeline import TextPipeline

nlp = spacy.load('en_core_web_sm')
normalizer = TextPipeline(nlp)

from utils.metrics import metric

k = '1'
t = 'phrase'
path_data = '/home/anto/Scrivania/Tesi/testing/processed_data/' + t + '_' + k
with open(path_data, 'rb') as handle:
    data = pickle.load(handle)

X = [" ".join(item['data']) for item in data]
Y = [item['tag'] for item in data]
# inspect a slice of the processed data, then stop before the training code below
for i in range(550, 2000):
    print(">", Y[i])
    print(">", X[i])
    print("\n ==== \n")
exit()

vectorizer = TfidfVectorizer(ngram_range=(3, 3))
matrix = vectorizer.fit_transform(X)
knn = NearestNeighbors(n_neighbors=5, metric='cosine').fit(matrix)


def predict(id):
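    # Hypothetical completion (the original snippet stops at the signature above):
    # print the nearest training phrases to item `id` using the KNN index fitted above.
    dist, idx = knn.kneighbors(matrix[id])
    for d, i in zip(dist[0], idx[0]):
        print("%.3f" % d, Y[i])
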
Example #8
class Processer():
    def __init__(self, filepath="", part=""):
        self.filepath = filepath + "total_" + part + ".json"
        self.nlp = spacy.load('en_core_web_' + config.size_nlp)
        self.normalizer = TextPipeline(self.nlp)

        if part == "paragraph" or part == "section" or part == "trigram":
            if part == "trigram":
                # uso le frasi per l'estazione dei trigrammi
                self.filepath = filepath + "total_phrase.json"
            self.tag = part[0].upper()
        else:
            self.tag = "F"

        print(self.filepath)
        with open(self.filepath) as json_file:
            self.data = json.load(json_file)

        if self.tag == "T":
            print("TOTAL {}: {} (NUMERO DI FRASI)".format(
                part, self.data['total']))
        else:
            print("TOTAL {}: {}".format(part, self.data['total']))

    def __iter__(self):
        if config.DEBUG:
            docList = list(self.data['data'].keys())[:config.item_on_debug]
        else:
            docList = list(self.data['data'].keys())

        for progress_doc, docname in enumerate(docList):
            print("Doc {}: {}/{}".format(docname, progress_doc, len(docList)))
            items_of_doc = self.data['data'][docname]
            for (i, item) in enumerate(items_of_doc):
                data_list_normalized = self.normalizer.convert_trigram(item)

                for key in data_list_normalized.keys():
                    yield [{
                        'tag':
                        '[' + docname + '#' + self.tag + "_" +
                        str(uuid.uuid4()) + ']' + key,
                        'data':
                        data_list_normalized[key]
                    }]

    def run(self):
        if config.DEBUG:
            docList = list(self.data['data'].keys())[:config.item_on_debug]
        else:
            docList = list(self.data['data'].keys())

        result = []
        for docname in tqdm(docList):
            items_of_doc = self.data['data'][docname]
            for (i, item) in enumerate(items_of_doc):
                data_list_normalized = self.normalizer.convert(item)
                if len(data_list_normalized) > 0:
                    result += [{
                        'tag':
                        '[' + docname + '#' + self.tag + "_" +
                        str(uuid.uuid4()) + ']' + item,
                        'data':
                        data_list_normalized
                    }]
        return result
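
A minimal usage sketch of Processer; the input directory and output path below are illustrative, and the pickled list is the kind of data the TextVectorizer / Minhash snippets above load:

import pickle

# Hypothetical usage: normalize every paragraph of data/total_paragraph.json and
# pickle the tagged result for the retrieval models above.
proc = Processer(filepath="data/", part="paragraph")
processed = proc.run()
with open("processed_data/paragraph_3", "wb") as handle:
    pickle.dump(processed, handle)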