def __init__(self, files, file_format=None):
    self.files = files
    self.files_opened = []
    self.file_format = file_format
    self.db_server = db_handler()
    for f in self.files:
        self.files_opened.append(OpenFile(f))
    if file_format is None:
        pass
    elif file_format == "docx":
        for i, f1 in enumerate(self.files_opened):
            for f2 in self.files_opened[i + 1:]:
                print("# Comparing {} and {} #".format(f1.location, f2.location))
                self.compare_docx(f1, f2)
    elif file_format == "pptx":
        for i, f1 in enumerate(self.files_opened):
            for f2 in self.files_opened[i + 1:]:
                print("# Comparing {} and {} #".format(f1.location, f2.location))
                self.compare_docx(f1, f2, pptx=True)
    elif file_format == "xlsx":
        for i, f1 in enumerate(self.files_opened):
            for f2 in self.files_opened[i + 1:]:
                print("# Comparing {} and {} #".format(f1.location, f2.location))
                self.compare_xlsx(f1, f2, mod="diff")
                #self.compare_xlsx(f1, f2, mod="pd")
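# compare_docx() and compare_xlsx() are referenced above but not defined in
# this module. A minimal sketch of compare_docx, assuming OpenFile exposes
# the extracted text as .text and the path as .location (difflib is stdlib);
# the real implementation may differ:
def compare_docx(self, f1, f2, pptx=False):
    import difflib
    diff = difflib.unified_diff(
        f1.text.splitlines(), f2.text.splitlines(),
        fromfile=f1.location, tofile=f2.location, lineterm="")
    for line in diff:
        print(line)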
def __init__(self, path, db_name=None, file_type=None, key=None, proc="path2list2"):
    self.path = path
    self.key = key
    self.file_type = file_type
    self.db_name = db_name
    self.db_server = db_handler()
    self.docx = []
    self.xlsx = []
    self.pptx = []
    self.pdf = []
    self.img = []
    self.vid = []
    self.audio = []
    if proc == "path2list2":
        self.path2list2(self.path)
    elif proc == "list_files":
        self.list_files(self.path, self.db_name, self.key)
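# path2list2() and list_files() are not shown in this module. A minimal
# sketch of path2list2, assuming it walks the tree and sorts files into the
# extension buckets initialised above (the exact extension sets are an
# assumption):
def path2list2(self, path):
    import os  # local import so the sketch is self-contained
    buckets = {
        ("docx", "doc", "odt"): self.docx,
        ("xlsx", "xls", "ods"): self.xlsx,
        ("pptx", "ppt", "odp"): self.pptx,
        ("pdf",): self.pdf,
        ("png", "jpg", "jpeg", "tiff"): self.img,
        ("mp4", "avi", "mkv"): self.vid,
        ("mp3", "wav", "flac"): self.audio,
    }
    for root, dirs, filenames in os.walk(path):
        for name in filenames:
            ext = name.rsplit(".", 1)[-1].lower()
            for exts, bucket in buckets.items():
                if ext in exts:
                    bucket.append(os.path.join(root, name))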
def __init__(self, files, search=None):
    self.files = files
    self.files_opened = []
    # guard against search=None before lower-casing
    self.search = search.lower() if search else None
    self.text = []
    self.db_server = db_handler()
    for f in self.files:
        self.files_opened.append(OpenFile(f))
    for i, f in enumerate(self.files):
        if f.endswith("xlsx") or f.endswith("xls") or f.endswith("ods"):
            self.text.append(str(self.files_opened[i].tables))
        else:
            self.text.append(self.files_opened[i].text)
    if self.search:
        res = self.db_server.query(db_sh, ["term"], query_key="_id",
                                   query_value="txt_in_txt")
        res2 = []
        for row in res:
            for term in row.key[0]:
                res2.append(term)
        res3 = set(res2)
        if self.search not in res3:
            res3.add(self.search)
            self.db_server.save(db_sh, {'term': list(res3)}, doc_id="txt_in_txt")
        for i in range(len(files)):
            self.find(i)
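# find() is referenced above but not defined in this module. A minimal
# sketch, assuming it scans the extracted text of file i for the search
# term and reports matching lines (case-insensitive):
def find(self, i):
    print("# Searching for '{}' in {} #".format(self.search, self.files[i]))
    for n, line in enumerate(self.text[i].splitlines(), start=1):
        if self.search in line.lower():
            print("line {}: {}".format(n, line.strip()))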
def __init__(self, files, tags=None):
    self.media = files
    self.results = []
    self.times = []
    self.segmentation = []
    self.tags = tags
    self.meta = []
    self.db_server = db_handler()
    self.classify()
    self.save_results()
    self.print_results()
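# classify(), save_results() and print_results() are not shown here. A
# minimal sketch of save_results, assuming each entry in self.results is
# the label set for the corresponding media file; the target database
# (db_ic_i here) is an assumption:
def save_results(self):
    for f, result in zip(self.media, self.results):
        self.db_server.save(db_ic_i, {'class': result}, doc_id=f)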
def __init__(self):
    self.db_server = db_handler()
    res = self.db_server.query(db_ocr, ["_id", "content"])
    self.ocr_history = {}
    for row in res:
        self.ocr_history[row.key[0]] = row.key[1]
    res = self.db_server.query(db_sh, ["_id", "term"])
    self.search_history = {}
    for row in res:
        self.search_history[row.key[0]] = list(row.key[1])
    print(self.search_history)
def __init__(self, files, lang=None, search=None, file_type=None):
    self.files = files
    self.lang = lang
    # guard against search=None before lower-casing
    self.search = search.lower() if search else None
    self.text = []
    self.db_server = db_handler()
    res = self.db_server.query(db_ocr, ["_id", "content"])
    self.ocr_history = {}
    for row in res:
        self.ocr_history[row.key[0]] = row.key[1]
    for f in files:
        print("# OCR for: {} #".format(f))
        if f in self.ocr_history:
            self.text.append(self.ocr_history[f])
        elif file_type == "pdf":
            doc = fitz.open(f)
            fontlist = doc.getPageFontList(0)
            if fontlist == []:
                # no embedded fonts: the PDF is a scan, so rasterise and OCR it
                imgs = self.pdf2img(f)
                tmp2 = ""
                for img in imgs:
                    ocv_img = self.img_preprocess(img)
                    tmp2 += str(pytesseract.image_to_string(ocv_img, lang=self.lang))
                self.text.append(tmp2)
                # cache the OCR result for this file
                self.db_server.save(db_ocr, {'content': tmp2}, doc_id=f)
            else:
                # embedded text layer: extract directly; textract returns bytes
                tmp = textract.process(f, encoding='utf-8').decode('utf-8')
                self.text.append(tmp)
                self.db_server.save(db_ocr, {'content': tmp}, doc_id=f)
        else:
            ocv_img = cv2.imread(f)
            ocv_img = self.img_preprocess(ocv_img)
            tmp = pytesseract.image_to_string(ocv_img, lang=self.lang)
            self.text.append(tmp)
            self.db_server.save(db_ocr, {'content': tmp}, doc_id=f)
    if self.search:
        res = self.db_server.query(db_sh, ["term"], query_key="_id",
                                   query_value="txt_in_img")
        res2 = []
        for row in res:
            for term in row.key[0]:
                res2.append(term)
        res3 = set(res2)
        if self.search not in res3:
            res3.add(self.search)
            self.db_server.save(db_sh, {'term': list(res3)}, doc_id="txt_in_img")
        for i in range(len(files)):
            self.find(i)
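# img_preprocess() is referenced above but not defined in this module.
# A minimal sketch of a typical Tesseract pre-processing step, assuming a
# BGR OpenCV image in and a binarised image out (the exact pipeline used
# here is an assumption):
def img_preprocess(self, ocv_img):
    gray = cv2.cvtColor(ocv_img, cv2.COLOR_BGR2GRAY)
    # Otsu thresholding gives a clean black-on-white image for OCR
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thresh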
import os
import hashlib
import sys
import time
import logging

from watchdog.observers import Observer
from watchdog.events import LoggingEventHandler, FileSystemEventHandler
from anytree import Node, RenderTree

from db_handler import *
from ocr import OCR
from auto_classifier import AUTO_CLASSIFIER

db_server = db_handler()
file_hashes = []
folder_hashes = []


def list_files_hashes():
    pass


def save_file(name, path):
    st = os.stat(path)
    try:
        import pwd  # not available on all platforms
        userinfo = pwd.getpwuid(st.st_uid)
    except (ImportError, KeyError):
        print("failed to get the owner name for", path)
        userinfo = "[ERROR] UNKNOWN"
    #print("file {}, owned by: {}".format(path, userinfo[0]))
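# The observer wiring is not shown in this module. A minimal sketch of how
# the watchdog imports above are typically put to work, assuming the watched
# path arrives as a command-line argument; the handler name is hypothetical:
class IndexHandler(FileSystemEventHandler):
    def on_created(self, event):
        # index newly created files as they appear
        if not event.is_directory:
            save_file(os.path.basename(event.src_path), event.src_path)


if __name__ == "__main__":
    path = sys.argv[1] if len(sys.argv) > 1 else "."
    observer = Observer()
    observer.schedule(IndexHandler(), path, recursive=True)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()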
def __init__(self, files, templates, file_type=None):
    self.files = files
    self.templates = templates
    self.results = []
    self.db_server = db_handler()
    res = self.db_server.query(db_sh, ["term"], query_key="_id",
                               query_value="img_in_img")
    res2 = []
    for row in res:
        for term in row.key[0]:
            res2.append(term)
    res3 = set(res2)
    for tmplt in templates:
        if tmplt not in res3:
            res3.add(tmplt)
    self.db_server.save(db_sh, {'term': list(res3)}, doc_id="img_in_img")
    # Initiate SIFT detector
    self.sift = cv2.xfeatures2d.SIFT_create()
    for f in self.files:
        if file_type == "pdf":
            imgs = self.pdf2img(f)
        else:
            img_t = cv2.imread(f)  # trainImage
        for tmplt in self.templates:
            img_q = cv2.imread(tmplt)  # queryImage
            good = []
            print("# searching for {} in {}".format(tmplt, f))
            if file_type == "pdf":
                for p_img in imgs:
                    matches = self.sift_run(img_q, p_img)
                    # ratio test as per Lowe's paper
                    for m, n in matches:
                        if m.distance < 0.5 * n.distance:
                            good.append([m])
            else:
                matches = self.sift_run(img_q, img_t)
                # ratio test as per Lowe's paper
                for m, n in matches:
                    if m.distance < 0.5 * n.distance:
                        good.append([m])
            if good != []:
                db_res = self.db_server.query(db_ic_i, ["class"], query_key="_id",
                                              query_value=f)
                db_res2 = []
                for row in db_res:
                    for cls in row.key[0]:
                        db_res2.append(cls)
                db_res3 = set(db_res2)
                if tmplt not in db_res3:
                    db_res3.add(tmplt)
                    self.db_server.save(db_ic_i, {'class': list(db_res3)}, doc_id=f)
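# sift_run() is referenced above but not defined in this module. A minimal
# sketch, assuming it computes SIFT descriptors for both images and returns
# the two nearest neighbours per query descriptor so the ratio test above
# can be applied:
def sift_run(self, img_q, img_t):
    kps_q, descs_q = self.sift.detectAndCompute(img_q, None)
    kps_t, descs_t = self.sift.detectAndCompute(img_t, None)
    bf = cv2.BFMatcher()
    return bf.knnMatch(descs_q, descs_t, k=2)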
config.read('bot.ini')
print('bot.ini loaded.')

# Convert config to a simple dict for ease of use:
strings = {}
for section in config.sections():
    for tup in config.items(section):
        strings.update({tup[0]: tup[1]})
print('\tbot.ini integrated in bot.')

with open('moves.json', 'r') as f:
    dic = json.load(f)
print('Moves loaded.')

db_handler = db_handler()  # note: rebinds the class name to the instance
counter = counter.counter(datetime.datetime.now())
initialized = None

# -------------- Events ------------------


@bot.event
async def on_ready():
    """Lets Tim know the bot loaded properly"""
    b = datetime.datetime.now()
    s = 'Bot is initialized after ' + str((b - a).seconds)
    s += '.' + str((b - a).microseconds)[3:] + 's.'
    print(s)
def __init__(self, files, templates, file_type=None):
    self.files = files
    self.templates = templates
    self.results = []
    self.db_server = db_handler()
    res = self.db_server.query(db_sh, ["term"], query_key="_id",
                               query_value="img_in_img")
    res2 = []
    for row in res:
        for term in row['term']:
            res2.append(term)
    res3 = set(res2)
    for tmplt in templates:
        if tmplt not in res3:
            res3.add(tmplt)
    self.db_server.save(db_sh, {'term': list(res3)}, doc_id="img_in_img")
    # Initiate SIFT detector
    self.sift = cv2.xfeatures2d.SIFT_create()
    # Initiate SURF detector (Hessian threshold typically 300-500)
    self.surf = cv2.xfeatures2d.SURF_create(400)
    # Initiate brute-force matcher
    self.bf = cv2.BFMatcher(normType=cv2.NORM_L2, crossCheck=False)
    self.algo = "surf"
    for f in self.files:
        if file_type == "pdf":
            imgs = self.pdf2img(f)
        else:
            img_t = cv2.imread(f)  # trainImage
        for tmplt in self.templates:
            img_q = cv2.imread(tmplt)  # queryImage
            good = []
            # get descriptors of the query image
            kps_q, descs_q = self.get_desc(img_q, self.algo)
            print("# searching for {} in {}".format(tmplt, f))
            if file_type == "pdf" and imgs != []:
                for p_img in imgs:
                    kps_t, descs_t = self.get_desc(p_img, self.algo)
                    if descs_t is not None:
                        matches = self.get_matches(descs_q, descs_t)
                        # ratio test as per Lowe's paper
                        if matches is not None:
                            for m, n in matches:
                                if m.distance < 0.5 * n.distance:
                                    good.append([m])
            else:
                kps_t, descs_t = self.get_desc(img_t, self.algo)
                if descs_t is not None:
                    matches = self.get_matches(descs_q, descs_t)
                    # ratio test as per Lowe's paper
                    if matches is not None:
                        for m, n in matches:
                            if m.distance < 0.5 * n.distance:
                                good.append([m])
            if good != []:
                db_res = self.db_server.query(db_ic_i, ["class"], query_key="_id",
                                              query_value=f)
                db_res2 = []
                for row in db_res:
                    for cls in row['class']:
                        db_res2.append(cls)
                db_res3 = set(db_res2)
                if tmplt not in db_res3:
                    db_res3.add(tmplt)
                    self.db_server.save(db_ic_i, {'class': list(db_res3)}, doc_id=f)
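# get_desc() and get_matches() are referenced above but not defined in this
# module. Minimal sketches, assuming get_desc dispatches on the algorithm
# name and get_matches wraps the brute-force k-NN match consumed by the
# ratio test:
def get_desc(self, img, algo):
    if algo == "surf":
        return self.surf.detectAndCompute(img, None)
    return self.sift.detectAndCompute(img, None)


def get_matches(self, descs_q, descs_t):
    if descs_q is None or descs_t is None:
        return None
    return self.bf.knnMatch(descs_q, descs_t, k=2)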
def __init__(self, files, file_format, method=None):
    self.files = files
    self.files_opened = []
    for f in self.files:
        self.files_opened.append(OpenFile(f))
    self.docLabels = []
    self.db_server = db_handler()
    for doc in self.files_opened:
        self.docLabels.append(doc.location)
    self.algo = "dbscan"
    # data stores the content of all files, in the same order as docLabels
    data = []
    if file_format == "docx" or file_format == "pptx":
        db = db_ds
        for doc in self.files_opened:
            data.append(doc.text)
    elif file_format == "xlsx":
        db = db_xs
        for i, doc in enumerate(self.files_opened):
            try:
                data.append(json.dumps(doc.tables, skipkeys=True))
            except Exception:
                print("error parsing document {}".format(self.docLabels[i]))
                data.append("")
    data = nlp_clean(data)

    if method == "fuzzywuzzy":
        for i, f1 in enumerate(data):
            for j, f2 in enumerate(data[i + 1:], start=i + 1):
                x = fuzz.ratio(f1, f2)
                y = fuzz.partial_ratio(f1, f2)
                print("overall similarity ratio: {} %\npartial similarity ratio: {}".format(x, y))
                db_data = {
                    'dok_id': {
                        'dok_1': self.docLabels[i],
                        'dok_2': self.docLabels[j]
                    },
                    'kullanici': user_default,
                    'overall similarity ratio': x,
                    'partial similarity ratio': y
                }
                self.db_server.save(db, db_data,
                                    doc_id=self.docLabels[i] + "_" + self.docLabels[j])
    elif method == "inference":
        model_loc = "models/doc2vec_{}.model".format(file_format)
        # loading the model
        d2v_model = gensim.models.doc2vec.Doc2Vec.load(model_loc)
        # infer_vector is non-deterministic: the resulting vector differs on
        # each call, but it should be similar enough with a good model
        infervec = d2v_model.infer_vector(data[0], alpha=0.025,
                                          min_alpha=0.025, steps=300)
        similar_doc = d2v_model.docvecs.most_similar([infervec])
        most_similar = similar_doc[0][0]
        print("most similar: {}".format(most_similar))
        db_res = self.db_server.query(db_dc, ["docs", "clusters"],
                                      query_key="_id", query_value=file_format)
        db_res_a = []
        db_res_b = []
        for row in db_res:
            for a in row.key[0]:
                db_res_a.append(a)
            for b in row.key[1]:
                db_res_b.append(b)
        most_similar_class = db_res_b[db_res_a.index(most_similar)]
        print("most likely class: {}".format(most_similar_class))
        print("other documents in same category")
        for i in range(len(db_res_b)):
            if db_res_b[i] == most_similar_class:
                print(db_res_a[i])
    else:
        # iterator over all documents
        it = LabeledLineSentence(data, self.docLabels)
        model = gensim.models.Doc2Vec(vector_size=300, min_count=0,
                                      alpha=0.025, min_alpha=0.025)
        model.build_vocab(it)
        # training of the model with a decaying learning rate
        for epoch in range(100):
            model.train(it, total_examples=model.corpus_count, epochs=3)
            model.alpha -= 0.002
            model.min_alpha = model.alpha
        model.save('models/doc2vec_{}.model'.format(file_format))
        db_data = {"time": "time", "path": dataset_path}
        self.db_server.save(db_gensim, db_data, doc_id=file_format,
                            attachment='models/doc2vec_{}.model'.format(file_format))
        print("model saved")
        # loading the model back
        d2v_model = gensim.models.doc2vec.Doc2Vec.load(
            'models/doc2vec_{}.model'.format(file_format))
        # collect the vector of every document in docLabels
        X = []
        for i, _ in enumerate(self.docLabels):
            X.append(d2v_model.docvecs[i])
        X = np.array(X)

        # #####################################################################
        # Compute clusters
        if self.algo == "aff":
            af = AffinityPropagation(preference=-50).fit(X)
            cluster_centers_indices = af.cluster_centers_indices_
            n_clusters_ = len(cluster_centers_indices)
            labels = af.labels_
        elif self.algo == "dbscan":
            X = StandardScaler().fit_transform(X)
            af = DBSCAN(eps=3, min_samples=2).fit(X)
            core_samples_mask = np.zeros_like(af.labels_, dtype=bool)
            core_samples_mask[af.core_sample_indices_] = True
            labels = af.labels_
            unique_labels = set(labels)
            n_clusters_ = len(unique_labels)
        print("number of clusters: {}".format(n_clusters_))
        dic = {i: np.where(labels == i)[0] for i in range(n_clusters_)}
        dic2 = {}
        for key, value in dic.items():
            print("cluster {}:".format(key))
            for e in value:
                print("{} : {}".format(e, self.files[e].split('/')[-1]))
                dic2[self.docLabels[e]] = key
        print(dic2)

        # #####################################################################
        # Plot result
        import matplotlib.pyplot as plt
        from itertools import cycle
        plt.close('all')
        plt.figure(figsize=(25, 10))
        plt.clf()
        if self.algo == "aff":
            colors = cycle('bgrcmyk')
            for k, col in zip(range(n_clusters_), colors):
                class_members = labels == k
                cluster_center = X[cluster_centers_indices[k]]
                plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
                plt.plot(cluster_center[0], cluster_center[1], 'o',
                         markerfacecolor=col, markeredgecolor='k', markersize=5)
                for x in X[class_members]:
                    plt.plot([cluster_center[0], x[0]],
                             [cluster_center[1], x[1]], col)
            plt.title('Clustering with Affinity Propagation | '
                      'Estimated number of clusters: %d' % n_clusters_)
            plt.savefig('models/{}_affinity_clusters.png'.format(file_format), dpi=300)
        elif self.algo == "dbscan":
            colors = [plt.cm.Spectral(each)
                      for each in np.linspace(0, 1, len(unique_labels))]
            for k, col in zip(unique_labels, colors):
                if k == -1:
                    # black is used for noise
                    col = [0, 0, 0, 1]
                class_member_mask = (labels == k)
                # core samples drawn large, border samples drawn small
                xy = X[class_member_mask & core_samples_mask]
                plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                         markeredgecolor='k', markersize=14)
                xy = X[class_member_mask & ~core_samples_mask]
                plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                         markeredgecolor='k', markersize=6)
            plt.title('Clustering with DBScan | '
                      'Estimated number of clusters: %d' % n_clusters_)
            plt.savefig('models/{}_dbscan_clusters.png'.format(file_format), dpi=300)
        plt.show()

        db_data = dic2
        db_data["docs"] = self.docLabels
        db_data["clusters"] = labels.tolist()
        self.db_server.save(db_dc, db_data, doc_id=file_format,
                            attachment='models/{}_{}_clusters.png'.format(
                                file_format, self.algo))

        # #####################################################################
        # Hierarchical clustering
        linkage_matrix = []
        linkage_matrix.append(linkage(X, method='average', metric='euclidean'))
        for n, l in enumerate(linkage_matrix):
            # calculate the full dendrogram
            plt.figure(figsize=(25, 10))
            plt.title('Hierarchical Clustering Dendrogram')
            plt.ylabel('word')
            plt.xlabel('distance')
            dendrogram(
                l,
                leaf_rotation=0.,    # rotates the x-axis labels
                leaf_font_size=16.,  # font size for the x-axis labels
                orientation='left',
                leaf_label_func=lambda v: str(self.files[v].split('/')[-1]))
            plt.savefig('models/{}_hierarchical_clusters.png'.format(file_format),
                        dpi=300)
            plt.show()
        db_data = {}
        self.db_server.save(db_dc, db_data, doc_id=file_format,
                            attachment='models/{}_hierarchical_clusters.png'.format(
                                file_format))
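# nlp_clean() and LabeledLineSentence are referenced above but not defined
# in this module. Minimal sketches, assuming nlp_clean tokenises and
# lower-cases each document and LabeledLineSentence yields gensim
# TaggedDocument objects for Doc2Vec training:
def nlp_clean(data):
    import re
    return [re.findall(r"[a-z0-9]+", text.lower()) for text in data]


class LabeledLineSentence:
    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield gensim.models.doc2vec.TaggedDocument(
                words=doc, tags=[self.labels_list[idx]])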
def clear_db(self):
    self.statusBar().showMessage('Clearing Database')
    db_server = db_handler()
    db_server.delete_all()
    self.statusBar().showMessage('Database Clearing Done')