Example #1
    def __init__(self, files, file_format=None):
        self.files = files
        self.files_opened = []
        self.file_format = file_format
        self.db_server = db_handler()

        for f in self.files:
            self.files_opened.append(OpenFile(f))
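        # all inputs are opened once up front; the format-specific branches
        # below then compare every unordered pair of opened files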

        if file_format is None:
            pass

        elif file_format == "docx":
            for i, f1 in enumerate(self.files_opened):
                for f2 in self.files_opened[i + 1:]:
                    print("# Comparing {} and {} #".format(
                        f1.location, f2.location))
                    self.compare_docx(f1, f2)
        elif file_format == "pptx":
            for i, f1 in enumerate(self.files_opened):
                for f2 in self.files_opened[i + 1:]:
                    print("# Comparing {} and {} #".format(
                        f1.location, f2.location))
                    self.compare_docx(f1, f2, pptx=True)
        elif file_format == "xlsx":
            for i, f1 in enumerate(self.files_opened):
                for f2 in self.files_opened[i + 1:]:
                    print("# Comparing {} and {} #".format(
                        f1.location, f2.location))
                    self.compare_xlsx(f1, f2, mod="diff")
                    #self.compare_xlsx(f1,f2,mod="pd")

Example #2
    def __init__(self,
                 path,
                 db_name=None,
                 file_type=None,
                 key=None,
                 proc="path2list2"):
        self.path = path
        self.key = key
        self.file_type = file_type
        self.db_name = db_name
        self.db_server = db_handler()

        #self.files = []
        self.docx = []
        self.xlsx = []
        self.pptx = []
        self.pdf = []
        self.img = []
        self.vid = []
        self.audio = []

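        # dispatch on the requested procedure: path2list2 is assumed to walk
        # the tree under self.path and fill the per-type lists above, while
        # list_files filters against the given db_name and key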
        if proc == "path2list2":
            self.path2list2(self.path)
        elif proc == "list_files":
            self.list_files(self.path, self.db_name, self.key)
Example #3
    def __init__(self, files, search=None):
        self.files = files
        self.files_opened = []
        #self.lang = lang
        self.search = search.lower() if search else None
        self.text = []
        self.db_server = db_handler()
        for f in self.files:
            self.files_opened.append(OpenFile(f))
        
        # spreadsheets carry structured tables rather than flat text, so their
        # tables attribute is stringified to make it searchable like the rest
        for i, f in enumerate(self.files):
            if f.endswith(("xlsx", "xls", "ods")):
                self.text.append(str(self.files_opened[i].tables))
            else:
                self.text.append(self.files_opened[i].text)
                     
        if self.search:
            res = self.db_server.query(db_sh, ["term"], query_key="_id", query_value="txt_in_txt")
            #print(type(res), res)
            res2 = []
            for row in res:
                res2.extend(row.key[0])
            #print(type(res2), res2)
            
            res3 = set(res2)
            #print(type(res3), res3)

            
            if self.search not in res3:
                res3.add(self.search)
                self.db_server.save(db_sh, {'term': list(res3)}, doc_id="txt_in_txt")

            for i in range(len(files)):
                self.find(i)
Example #4
    def __init__(self, files, tags=None):
        self.media = files
        self.results = []
        self.times = []
        self.segmentation = []
        self.tags = tags
        self.meta = []
        self.db_server = db_handler()
        self.classify()
        self.save_results()
        self.print_results()
Example #5
    def __init__(self):
        self.db_server = db_handler()
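        # rebuild the OCR cache from the database: each row maps a file path
        # (_id) to the text previously extracted from it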
        res = self.db_server.query(db_ocr, ["_id", "content"])
        self.ocr_history = {}
        for row in res:
            self.ocr_history[row.key[0]] = row.key[1]

        res = self.db_server.query(db_sh, ["_id", "term"])
        self.search_history = {}
        for row in res:
            tmp = []
            for _ in row.key[1]:
                tmp.append(_)
            self.search_history[row.key[0]] = tmp

        print(self.search_history)
Example #6
    def __init__(self, files, lang=None, search=None, file_type=None):
        self.files = files
        self.lang = lang
        self.search = search.lower() if search else None
        self.text = []
        self.db_server = db_handler()
        res = self.db_server.query(db_ocr, ["_id", "content"])
        self.ocr_history = {}
        for row in res:
            self.ocr_history[row.key[0]] = row.key[1]
        #print(self.ocr_history)

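        # OCR every file, consulting the cache first so Tesseract only runs
        # on files that have not been processed before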
        for f in files:
            #print(pytesseract.image_to_string(Image.open(f), lang=None))
            print("# OCR for: {} #".format(f))
            if f in self.ocr_history:
                self.text.append(self.ocr_history[f])

            elif file_type == "pdf":
                doc = fitz.open(f)
                fontlist = doc.getPageFontList(0)
                if fontlist == []:
                    imgs = self.pdf2img(f)
                    tmp2 = ""
                    for img in imgs:
                        ocv_img = self.img_preprocess(img)
                        tmp = str(
                            pytesseract.image_to_string(ocv_img,
                                                        lang=self.lang))
                        tmp2 += tmp
                    self.text.append(tmp2)

                else:
                    tmp = textract.process(f, encoding='utf-8')
                    self.text.append(tmp)

                self.db_server.save(db_ocr, {'content': tmp}, doc_id=f)

            else:
                ocv_img = cv2.imread(f)
                ocv_img = self.img_preprocess(ocv_img)

                tmp = pytesseract.image_to_string(ocv_img, lang=self.lang)
                self.text.append(tmp)
                #print(tmp)
                self.db_server.save(db_ocr, {'content': tmp}, doc_id=f)

        if self.search:
            res = self.db_server.query(db_sh, ["term"],
                                       query_key="_id",
                                       query_value="txt_in_img")
            #print(type(res), res)
            res2 = []
            for row in res:
                res2.extend(row.key[0])
            #print(type(res2), res2)

            res3 = set(res2)
            #print(type(res3), res3)

            if self.search not in res3:
                res3.add(self.search)
                self.db_server.save(db_sh, {'term': list(res3)},
                                    doc_id="txt_in_img")

            for i in range(len(files)):
                self.find(i)
Example #7
import os
import hashlib
import sys
import time
import logging
from watchdog.observers import Observer
from watchdog.events import LoggingEventHandler, FileSystemEventHandler
from anytree import Node, RenderTree
from db_handler import *
from ocr import OCR
from auto_classifier import AUTO_CLASSIFIER

db_server = db_handler()
file_hashes = []
folder_hashes = []
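# module-level state shared with the watchdog handlers below; the hash lists
# are presumably used to recognise files and folders that were already seen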


def list_files_hashes():
    pass


def save_file(name, path):
    st = os.stat(path)
    try:
        import pwd  # not available on all platforms
        userinfo = pwd.getpwuid(st.st_uid)
    except (ImportError, KeyError):
        print("failed to get the owner name for", path)
        userinfo = "[ERROR] UNKNOWN"
    #print("file {}, owned by: {}".format(path, userinfo[0]))
Example #8
    def __init__(self, files, templates, file_type=None):
        self.files = files
        self.templates = templates
        self.results = []
        self.db_server = db_handler()

        res = self.db_server.query(db_sh, ["term"],
                                   query_key="_id",
                                   query_value="img_in_img")
        #print(type(res), res)
        res2 = []
        for row in res:
            for _ in row.key[0]:
                res2.append(_)
        #print(type(res2), res2)

        res3 = set(res2)
        #print(type(res3), res3)

        for tmplt in templates:
            if tmplt not in res3:
                res3.add(tmplt)
                self.db_server.save(db_sh, {'term': list(res3)},
                                    doc_id="img_in_img")

        # Initiate SIFT detector
        self.sift = cv2.xfeatures2d.SIFT_create()
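        # sift_run below is assumed to return knnMatch(k=2) candidate pairs,
        # which is what the "for m, n in matches" ratio test requires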

        for f in self.files:
            if file_type == "pdf":
                imgs = self.pdf2img(f)
            else:
                img_t = cv2.imread(f)  # trainImage
            for tmplt in self.templates:
                img_q = cv2.imread(tmplt)  # queryImage
                good = []
                print("# searching for {} in {}".format(tmplt, f))
                if file_type == "pdf":
                    tmp2 = ""
                    for p_img in imgs:
                        img_t = p_img  # trainImage
                        matches = self.sift_run(img_q, img_t)
                        # ratio test as per Lowe's paper
                        for m, n in matches:
                            if m.distance < 0.5 * n.distance:
                                good.append([m])

                else:
                    matches = self.sift_run(img_q, img_t)
                    # Apply ratio test as per Lowe's paper
                    for m, n in matches:
                        if m.distance < 0.5 * n.distance:
                            good.append([m])

                if good:
                    db_res = self.db_server.query(db_ic_i, ["class"],
                                                  query_key="_id",
                                                  query_value=f)
                    #print(type(db_res), db_res)
                    db_res2 = []
                    for row in db_res:
                        db_res2.extend(row.key[0])
                    #print(type(db_res2), db_res2)

                    db_res3 = set(db_res2)
                    #print(type(db_res3), db_res3)
                    if tmplt not in db_res3:
                        db_res3.add(tmplt)
                        self.db_server.save(db_ic_i, {'class': list(db_res3)},
                                            doc_id=f)
Example #9
config.read('bot.ini')
print('bot.ini loaded.')

#Convert config to simple dict for ease of use:
strings = {}
for section in config.sections():
    for tup in config.items(section):
        strings.update({tup[0]: tup[1]})
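# e.g. a "[general]" section with "token = abc" ends up as strings["token"]
# == "abc"; duplicate keys across sections silently overwrite one another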
print('\tbot.ini integrated in bot.')

with open('moves.json', 'r') as f:
    dic = json.load(f)
print('Moves loaded.')

db_server = db_handler()

counter = counter.counter(datetime.datetime.now())
initialized = None

#--------------Events ------------------


@bot.event
async def on_ready():
    """Lets Tim know the bot loaded properly"""

    b = datetime.datetime.now()
    delta = b - a
    s = 'Bot is initialized after {}.{:03d}s.'.format(
        delta.seconds, delta.microseconds // 1000)
    print(s)
Example #10
    def __init__(self, files, templates, file_type=None):
        self.files = files
        self.templates = templates
        self.results = []
        self.db_server = db_handler()

        res = self.db_server.query(db_sh, ["term"], query_key="_id", query_value="img_in_img")
        #print(type(res), res)
        res2 = []
        for row in res:
            #print(row)
            #for _ in row.key[0]:
            for _ in row['term']:
                res2.append(_)
        #print(type(res2), res2)

        res3 = set(res2)
        #print(type(res3), res3)

        for tmplt in templates:
            if tmplt not in res3:
                res3.add(tmplt)
                self.db_server.save(db_sh, {'term': list(res3)}, doc_id="img_in_img")

        # Initiate SIFT detector
        self.sift = cv2.xfeatures2d.SIFT_create()

        # Initiate SURF detector
        self.surf = cv2.xfeatures2d.SURF_create(400)  # Hessian threshold, typically 300-500

        # Initiate BFMatcher
        self.bf = cv2.BFMatcher(normType=cv2.NORM_L2, crossCheck=False)

        self.algo = "surf"

        for f in self.files:
            if file_type == "pdf":
                imgs = self.pdf2img(f)
            else:
                img_t = cv2.imread(f) # trainImage
            for tmplt in self.templates:
                img_q = cv2.imread(tmplt) # queryImage
                good = []

                # get descriptors of query image
                kps_q, descs_q = self.get_desc(img_q, self.algo)

                print("# searching for {} in {}".format(tmplt, f))
                if file_type == "pdf" and imgs != []:
                    for p_img in imgs:
                        img_t = p_img # trainImage

                        kps_t, descs_t = self.get_desc(img_t, self.algo)

                        if descs_t is not None:
                            matches = self.get_matches(descs_q, descs_t)

                            # ratio test as per Lowe's paper
                            if matches is not None:
                                for m, n in matches:
                                    if m.distance < 0.5 * n.distance:
                                        good.append([m])

                else:
                    kps_t, descs_t = self.get_desc(img_t, self.algo)

                    if descs_t is not None:
                        matches = self.get_matches(descs_q, descs_t)
                        # ratio test as per Lowe's paper
                        if matches is not None:
                            for m, n in matches:
                                if m.distance < 0.5 * n.distance:
                                    good.append([m])

                if good:
                    db_res = self.db_server.query(db_ic_i, ["class"], query_key="_id", query_value=f)
                    #print(type(db_res), db_res)
                    db_res2 = []
                    for row in db_res:
                        db_res2.extend(row['class'])
                    #print(type(db_res2), db_res2)

                    db_res3 = set(db_res2)
                    #print(type(db_res3), db_res3)
                    if tmplt not in db_res3:
                        db_res3.add(tmplt)
                        self.db_server.save(db_ic_i, {'class': list(db_res3)}, doc_id=f)
Example #11
    def __init__(self, files, file_format, method=None):

        self.files = files
        self.files_opened = []
        for f in self.files:
            self.files_opened.append(OpenFile(f))
        self.docLabels = []
        self.db_server = db_handler()
        for doc in self.files_opened:
            self.docLabels.append(doc.location)

        self.algo = "dbscan"

        # create a list data that stores the content of all text files in order of their names in docLabels
        data = []
        if file_format == "docx" or file_format == "pptx":
            for doc in self.files_opened:
                #data.append(open(doc, encoding='latin-1').read())
                db = db_ds
                data.append(doc.text)
        elif file_format == "xlsx":
            for i, doc in enumerate(self.files_opened):
                #data.append(open(doc, encoding='latin-1').read())
                db = db_xs
                try:
                    data.append(json.dumps(doc.tables, skipkeys=True))
                except (TypeError, ValueError):
                    print("error parsing document {}".format(
                        self.docLabels[i]))
                    data.append("")

        data = nlp_clean(data)
        if method == "fuzzywuzzy":
            for i, f1 in enumerate(data):
                for f2 in data[i + 1:]:
                    # print(self.docLabels[i],self.docLabels[i+1])
                    x = fuzz.ratio(f1, f2)
                    y = fuzz.partial_ratio(f1, f2)
                    print(
                        "overall similarity ration: {} %\npartial similarity ration: {}"
                        .format(x, y))
                    db_data = {
                        'dok_id': {
                            'dok_1': self.docLabels[i],
                            'dok_2': self.docLabels[i + 1]
                        },
                        'kullanici': user_default,
                        'overall similarity ratio': x,
                        'partial similarity ratio': y
                    }
                    self.db_server.save(db,
                                        db_data,
                                        doc_id=self.docLabels[i] + "_" +
                                        self.docLabels[i + 1])

        elif method == "inference":
            #res = self.db_server.query(db_gensim,["_attachments"],query_key="_id", query_value=file_format)

            #model_loc ="{}gensim_models/docx/models/doc2vec_{}.model".format(server_default,file_format)
            model_loc = "models/doc2vec_{}.model".format(file_format)
            # loading the model
            d2v_model = gensim.models.doc2vec.Doc2Vec.load(model_loc)
            # d2v_model.init_sims(replace=False)

            # infer_vector is non-deterministic; i.e. the resulting vector is different each time, but it should be similar enough with a good model
            infervec = d2v_model.infer_vector(data[0],
                                              alpha=0.025,
                                              min_alpha=0.025,
                                              steps=300)
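            # most_similar returns (tag, cosine similarity) pairs sorted
            # best-first, so the closest training document sits at [0][0]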
            similar_doc = d2v_model.docvecs.most_similar([infervec])
            most_similar = similar_doc[0][0]
            print(type(most_similar))
            print("most similar: {}".format(most_similar))

            #db_res = self.db_server.query(db_dc,["_id","docs"])
            db_res = self.db_server.query(db_dc, ["docs", "clusters"],
                                          query_key="_id",
                                          query_value=file_format)
            print(db_res)
            db_res_a = []
            db_res_b = []
            for row in db_res:
                # db_res_a.append(row)
                for a in row.key[0]:
                    db_res_a.append(a)
                for b in row.key[1]:
                    db_res_b.append(b)
            # print(db_res_a)
            # print(db_res_b)
            most_similar_class = db_res_b[db_res_a.index(most_similar)]
            print("most likely class: {}".format(most_similar_class))
            print("other documents in same category")
            for i in range(len(db_res_b)):
                if db_res_b[i] == most_similar_class:
                    print(db_res_a[i])

        else:
            # iterator returned over all documents
            it = LabeledLineSentence(data, self.docLabels)
            model = gensim.models.Doc2Vec(vector_size=300,
                                          min_count=0,
                                          alpha=0.025,
                                          min_alpha=0.025)
            model.build_vocab(it)
            # training of model
            for epoch in range(100):
                #print ('iteration '+str(epoch+1))
                model.train(it, total_examples=model.corpus_count, epochs=3)
                model.alpha -= 0.002
                model.min_alpha = model.alpha
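            # manual alpha decay between passes follows the old gensim
            # tutorials; newer gensim releases recommend one train() call
            # with a fixed epochs count instead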

            model.save('models/doc2vec_{}.model'.format(file_format))

            db_g = db_gensim
            db_data = {"time": "time", "path": dataset_path}
            self.db_server.save(
                db_g,
                db_data,
                doc_id=file_format,
                attachment='models/doc2vec_{}.model'.format(file_format))

            print("model saved")

            # loading the model
            d2v_model = gensim.models.doc2vec.Doc2Vec.load(
                'models/doc2vec_{}.model'.format(file_format))

            # start testing
            X = []
            # printing the vector of documents in docLabels
            for i, _ in enumerate(self.docLabels):
                docvec = d2v_model.docvecs[i]
                # print(docvec)
                X.append(docvec)
            X = np.array(X)
            #docvec = d2v_model.docvecs[0]
            #print (docvec)
            #docvec = d2v_model.docvecs[1]
            #print (docvec)

            # to get most similar document with similarity scores using document-index
            #similar_doc = d2v_model.docvecs.most_similar(0)
            # print(similar_doc)

            # for doc in similar_doc:
            #    db_data = {'dok_id' : {'dok_1' : self.docLabels[0],'dok_2' : doc[0]}, 'kullanici': user_default, 'benzerlik orani': str(doc[1])}
            #    self.db_server.save(db, db_data)
            #similar_doc = d2v_model.docvecs.most_similar(1)
            # print(similar_doc)

            # printing the vector of the file using its name
            # docvec = d2v_model.docvecs['shakespeare-hamlet.txt'] #if string tag used in training
            # print(docvec)
            # to get most similar document with similarity scores using document- name
            #sims = d2v_model.docvecs.most_similar('shakespeare-hamlet.txt')
            # print(sims)

            # #############################################################################
            # Compute Affinity

            if self.algo == "aff":
                af = AffinityPropagation(preference=-50).fit(X)
                cluster_centers_indices = af.cluster_centers_indices_
                n_clusters_ = len(cluster_centers_indices)
                labels = af.labels_
            elif self.algo == "dbscan":  #trying DBScan instead
                X = StandardScaler().fit_transform(X)
                af = DBSCAN(eps=3, min_samples=2).fit(X)
                core_samples_mask = np.zeros_like(af.labels_, dtype=bool)
                core_samples_mask[af.core_sample_indices_] = True

                labels = af.labels_
                unique_labels = set(labels)
                n_clusters_ = len(unique_labels)
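                # unique_labels includes DBSCAN's -1 noise label, so noise
                # is counted here as one extra "cluster"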

            #labels2 = []
            # for i, lb in enumerate(labels):
            #    labels2.append(self.files[i].split('/')[-1])
            #print("labels: {}".format(labels))
            #print("labels2: {}".format(labels2))

            print("number of clusters: {}".format(n_clusters_))
            # group sample indices by cluster label; set(labels) also covers
            # DBSCAN's -1 noise label, which range(n_clusters_) would miss
            dic = {int(k): np.where(labels == k)[0] for k in set(labels)}
            dic2 = {}
            # print(dic)

            for key, value in dic.items():
                print("cluster {}:".format(key))
                for e in value:
                    print("{} : {}".format(e, self.files[e].split('/')[-1]))
                    dic2[self.docLabels[e]] = key

            print(dic2)

            # print('Estimated number of clusters: %d' % n_clusters_)
            # print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
            # print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
            # print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
            # print("Adjusted Rand Index: %0.3f"
            #     % metrics.adjusted_rand_score(labels_true, labels))
            # print("Adjusted Mutual Information: %0.3f"
            #     % metrics.adjusted_mutual_info_score(labels_true, labels))
            #print("Silhouette Coefficient: %0.3f"
            #      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

            # #############################################################################
            # Plot result
            import matplotlib.pyplot as plt
            from mpl_toolkits.mplot3d import Axes3D
            from itertools import cycle

            plt.close('all')
            plt.figure(figsize=(25, 10))
            plt.clf()

            # reduce dimensions
            # pca = PCA(n_components=2)
            # reduced = pca.fit_transform(X)
            # X = reduced

            if self.algo == "aff":
                colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
                for k, col in zip(range(n_clusters_), colors):
                    class_members = labels == k
                    cluster_center = X[cluster_centers_indices[k]]
                    plt.plot(X[class_members, 0], X[class_members, 1],
                             col + '.')
                    plt.plot(cluster_center[0],
                             cluster_center[1],
                             'o',
                             markerfacecolor=col,
                             markeredgecolor='k',
                             markersize=5)
                    for x in X[class_members]:
                        plt.plot([cluster_center[0], x[0]],
                                 [cluster_center[1], x[1]], col)

                plt.title(
                    'Clustering with Affinity Propagation | Estimated number of clusters: %d'
                    % n_clusters_)
                plt.savefig(
                    'models/{}_affinity_clusters.png'.format(file_format),
                    dpi=300)
            elif self.algo == "dbscan":
                colors = [
                    plt.cm.Spectral(each)
                    for each in np.linspace(0, 1, len(unique_labels))
                ]
                for k, col in zip(unique_labels, colors):
                    if k == -1:
                        # Black used for noise.
                        col = [0, 0, 0, 1]

                    class_member_mask = (labels == k)

                    xy = X[class_member_mask & core_samples_mask]
                    plt.plot(xy[:, 0],
                             xy[:, 1],
                             'o',
                             markerfacecolor=tuple(col),
                             markeredgecolor='k',
                             markersize=14)

                    xy = X[class_member_mask & ~core_samples_mask]
                    plt.plot(xy[:, 0],
                             xy[:, 1],
                             'o',
                             markerfacecolor=tuple(col),
                             markeredgecolor='k',
                             markersize=6)

                plt.title(
                    'Clustering with DBScan | Estimated number of clusters: %d'
                    % n_clusters_)
                plt.savefig(
                    'models/{}_dbscan_clusters.png'.format(file_format),
                    dpi=300)

            plt.show()

            #db = db_dc
            db_data = dic2
            db_data["docs"] = self.docLabels
            db_data["clusters"] = labels.tolist()
            self.db_server.save(
                db_dc,
                db_data,
                doc_id=file_format,
                attachment='models/{}_{}_clusters.png'.format(
                    file_format,
                    "affinity" if self.algo == "aff" else "dbscan"))

            # #########################
            # hierarchical

            linkage_matrix = []
            #linkage_matrix.append(linkage(X, method='single', metric='euclidean'))
            linkage_matrix.append(
                linkage(X, method='average', metric='euclidean'))
            #linkage_matrix.append(linkage(X, method='complete', metric='euclidean'))
            #linkage_matrix.append(linkage(X, method='ward', metric='euclidean'))

            #linkage_matrix.append(linkage(X, method='single', metric='seuclidean'))
            # linkage_matrix.append(linkage(X, method='average', metric='seuclidean'))
            #linkage_matrix.append(linkage(X, method='complete', metric='seuclidean'))

            for n, l in enumerate(linkage_matrix):
                # calculate full dendrogram
                plt.figure(figsize=(25, 10))
                plt.title('Hierarchical Clustering Dendrogram')
                plt.ylabel('document')
                plt.xlabel('distance')

                dendrogram(
                    l,
                    leaf_rotation=0.,  # rotates the x axis labels
                    leaf_font_size=16.,  # font size for the x axis labels
                    orientation='left',
                    leaf_label_func=lambda v: str(self.files[v].split('/')[-1])
                )
                # plt.savefig('clusters_{}.png'.format(n), dpi=200) #save figure as ward_clusters
                plt.savefig(
                    'models/{}_hierarchical_clusters.png'.format(file_format),
                    dpi=300)
                plt.show()

                db_data = {}
                self.db_server.save(
                    db_dc,
                    db_data,
                    doc_id=file_format,
                    attachment='models/{}_hierarchical_clusters.png'.format(
                        file_format))
Example #12
    def clear_db(self):
        self.statusBar().showMessage('Clearing Database')
        db_server = db_handler()
        db_server.delete_all()
        self.statusBar().showMessage('Database Clearing Done')