def _flatten_whitespace(data):
    """Replace newlines, tabs and carriage returns in *data* with spaces.

    Returns:
        ``[flattened, replaced]`` where ``flattened`` has the same length as
        *data* and ``replaced`` is a list of ``[index, character]`` pairs, one
        for every character that was turned into a space.
    """
    replaced = [
        [position, char]
        for position, char in enumerate(data)
        if char in ('\n', '\t', '\r')
    ]
    flattened = data.translate(str.maketrans('\n\t\r', '   '))
    return [flattened, replaced]


def read_data_from_file(ifile, format='txt'):
    """Read *ifile* and return its text with line breaks and tabs flattened.

    Args:
        ifile: Path of the file to read.
        format: ``'txt'`` reads the file directly. Any other value converts
            the file to XML with the external ``odf2xml`` tool (through
            ``runShell``) and reads the converted file as UTF-8.

    Returns:
        ``[data, replaced]``: the text with every newline, tab and carriage
        return replaced by a single space, and the list of
        ``[index, character]`` pairs recording what was replaced.

    Raises:
        SystemExit: when *format* is ``'txt'`` and *ifile* does not exist.
    """
    import os
    import shlex
    import sys

    if format == 'txt':
        try:
            with open(ifile, 'r') as f:
                data = f.read()
        except FileNotFoundError as fnf_error:
            # sys.exit is always available; the bare exit() helper is only
            # injected by the site module.
            sys.exit(fnf_error)
        return _flatten_whitespace(data)

    temp_path = ifile[0:len(ifile) - 4] + '_temp.xml'
    # Quote both paths so spaces or shell metacharacters cannot split or
    # extend the command.
    runShell('odf2xml -o ' + shlex.quote(temp_path) + ' ' + shlex.quote(ifile))
    try:
        with open(temp_path, mode='r', encoding='utf-8') as f:
            data = f.read()
    finally:
        # Remove the intermediate XML even when reading it failed.
        if os.path.exists(temp_path):
            os.remove(temp_path)
    return _flatten_whitespace(data)
def clear_documents_on_redirect(request):
    """Delete everything inside the ``../upload_file/documents/`` directory.

    The directory is resolved relative to this module's location. Entries
    whose name starts with a dot are kept, matching what the shell glob ``*``
    would have selected.

    Deletion is done in-process and only inside the resolved directory. The
    earlier ``cd <dir>;rm -r *`` form would run ``rm -r *`` in the current
    working directory whenever the ``cd`` failed.

    Args:
        request: Unused; accepted so the function keeps its call signature.
    """
    import shutil

    documents_dir = os.path.join(
        os.path.dirname(__file__), '../upload_file/documents/')
    if not os.path.isdir(documents_dir):
        # Nothing to clear, and nothing else must be touched.
        return

    with os.scandir(documents_dir) as listing:
        entries = [entry for entry in listing
                   if not entry.name.startswith('.')]

    for entry in entries:
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            # Regular files and symlinks (including links to directories).
            os.remove(entry.path)
def _mask_symbol(method):
    """Return the one-character mask symbol carried by *method*, or ``''``.

    NOTE(review): the word search unpacks ``method`` as
    ``[method_type, symbol, length_replace]``, yet the default value is the
    plain string ``'delete'``. A plain string (or anything that is not a
    three-item sequence) is treated here as "no mask symbol" -- confirm
    against the callers.
    """
    if isinstance(method, str):
        return ''
    try:
        _method_type, symbol, _length_replace = method
    except (TypeError, ValueError):
        return ''
    # Only the first character of a longer symbol is used.
    return symbol[:1]


def _restore_whitespace(text, replaced):
    """Write each recorded ``[index, character]`` pair back into *text*.

    An index past the end of *text* appends the character, which is what the
    equivalent slice-and-concatenate expression does.
    """
    chars = list(text)
    for position, letter in replaced:
        if position < len(chars):
            chars[position] = letter
        else:
            chars.append(letter)
    return ''.join(chars)


def find_entities(ifile, ofile, method='delete', patterns_file='patterns.json',
                  verbose=False, words_array=None, exclude_array=None):
    """Find entities in *ifile* and write an anonymized copy to *ofile*.

    Entities are six-item lists
    ``[entity_name, entity_value, span/word, start, end, found_by_spacy]``:
    ``span/word`` is the text exactly as found, ``entity_value`` is the value
    extracted from it, and ``start``/``end`` index into the flattened text.

    Args:
        ifile: Input path; its last three characters must be ``txt`` or
            ``odt``.
        ofile: Output path. For ``odt`` input the XML is written here and
            then converted in place with ``xml2odf``.
        method: Passed unchanged to ``anonymize_element``. When it is a
            three-item sequence its second item is the mask symbol used to
            turn custom words into patterns.
        patterns_file: Configuration file handed to ``read_patterns``.
        verbose: Print every entity that was found.
        words_array: Extra words to search for; ``None`` means none.
        exclude_array: Passed to ``Excluder.exclude``; ``None`` means an
            empty list.

    Raises:
        NameError: if *ifile* does not end in ``txt`` or ``odt``.
    """
    import re
    import shlex

    # Fresh lists per call instead of shared mutable defaults.
    words_array = [] if words_array is None else words_array
    exclude_array = [] if exclude_array is None else exclude_array

    extension = ifile[-3:]
    if extension not in ('odt', 'txt'):
        raise NameError('find_entities: Not extension .txt or .odt')
    is_odt = extension == 'odt'
    data, replaced = read_data_from_file(ifile=ifile, format=extension)

    # Read the configuration file.
    patterns_json = read_patterns(patterns_file)

    entities = []
    for matcher, value in patterns_json['matcher'].items():
        if value['active'] == 'False':
            continue
        custom_pattern_method = getattr(matcher_patterns, matcher)
        results = custom_pattern_method(
            data=data, pattern=fix_pattern(value['pattern']))
        if results is not None:
            entities += results

    # Custom word search in the text.
    symbol = _mask_symbol(method)
    for word in words_array:
        if symbol and symbol in word:
            # Semi-anonymized word such as '9 **** 2019': every mask symbol
            # becomes '.', so it matches any character.
            pattern = '.'.join(word.split(symbol))
        else:
            pattern = word
        results = matcher_patterns.custom_words(data=data, word=pattern)
        if results is not None:
            entities += results

    entities.sort(key=sort_by_start)

    # Drop exact duplicates while keeping the sorted order (entities are
    # lists, so they cannot go into a set).
    unique_entities = []
    for entity in entities:
        if entity not in unique_entities:
            unique_entities.append(entity)
    entities = unique_entities

    if verbose:
        print(colored(f'\n\n-------------File:{ifile}-------------', 'green'))
        for element in entities:
            print(
                '[',
                colored(element[0], 'yellow'),
                ',',
                colored(element[1], 'blue'),
                ',',
                colored(element[2], 'cyan'),
                ',',
                element[3],
                ',',
                element[4],
                ']',
            )

    # Build the anonymized text: copy the gaps between entities and replace
    # each entity with whatever anonymize_element returns for it.
    tag_regex = re.compile(r'(<.*?>)')
    parts = []
    index = 0
    previous_e = 0
    for element in Excluder.exclude(entities, exclude_array):
        span = element[2]
        s = element[3]
        e = element[4]

        # Brand elements may have html tags in .odt files.
        if element[0] == 'brand_name' and is_odt:
            parts.append(data[index:s])
            index = s
            previous_e = e
            tags = [[found.start(), found.end(), found.group(0)]
                    for found in tag_regex.finditer(span)]
            if not tags:
                parts.append(anonymize_element(element, method))
                index = e
                continue

            # Anonymize the text between tags and keep the tags themselves.
            temp_index = index
            for st, end, tag_text in tags:
                temp_data = data[temp_index:index + st]
                temp_element = [
                    'brand_name', temp_data.upper(), temp_data,
                    temp_index, index + st, False
                ]
                parts.append(anonymize_element(temp_element, method))
                parts.append(tag_text)
                temp_index = index + end
            temp_data = data[temp_index:e]
            temp_element = [
                'brand_name', temp_data.upper(), temp_data,
                temp_index, e, False
            ]
            parts.append(anonymize_element(temp_element, method))
            index = e
            continue

        if previous_e >= e:
            # Current element lies inside the previous one: already handled.
            index = previous_e
            continue
        elif s >= previous_e:
            # Common case: no overlap with the previous element.
            parts.append(data[index:s])
            parts.append(anonymize_element(element, method))
        else:
            # Partial overlap: anonymize only the part past the previous end.
            temp_element = [
                element[0], element[1], data[previous_e:e],
                previous_e, e, element[5]
            ]
            parts.append(anonymize_element(temp_element, method))
        previous_e = e
        index = e

    if index < len(data):
        parts.append(data[index:])

    # Put the original newlines, tabs and carriage returns back.
    final_text = _restore_whitespace(''.join(parts), replaced)

    # The odt XML was decoded as UTF-8, so it is written back as UTF-8; txt
    # files keep the platform default they were read with.
    with open(ofile, mode='w', encoding='utf-8' if is_odt else None) as of:
        of.write(final_text)

    if is_odt:
        # Convert the XML written above into an odt file, in place.
        quoted_ofile = shlex.quote(ofile)
        runShell('xml2odf -o ' + quoted_ofile + ' ' + quoted_ofile)
def anonymize_file(id='', user_folder='default', files_folder='files',
                   custom_words='', text='', download=False,
                   updateTextIfPossible=False, rerender_text=True):
    """Anonymize an uploaded document and return original and result.

    The document is looked up with ``Document.objects.get(id=id)`` and its
    string form is used as the file name. Anonymization is done by running
    ``python3 -m anonymizer_service`` through ``runShell``.

    Args:
        id: Primary key of the ``Document`` to process.
        user_folder: Folder under ``documents/`` that holds the user's files.
        files_folder: Sub-folder of *user_folder* containing the upload.
        custom_words: Extra words passed to the anonymizer with ``-w``; one
            leading comma is stripped.
        text: Overwritten before it is read; kept for interface
            compatibility.
        download: For ``odt`` files, ``True`` produces the anonymized odt and
            returns ``[{}, {}]``; ``False`` builds a text preview.
        updateTextIfPossible: Force regeneration of the intermediate files.
        rerender_text: Force the anonymizer to run even if its output exists.

    Returns:
        ``[document, document_anonymized]``, two dicts with the keys
        ``name``, ``text``, ``type``, ``user_folder``, ``files_folder`` and
        ``error``; or ``[{}, {}]`` for an ``odt`` download.
    """
    import shlex

    obj_file = Document.objects.get(id=id)
    filename = str(obj_file)
    base_dir = os.path.dirname(__file__)
    file = os.path.join(
        base_dir,
        'documents/' + user_folder + '/' + files_folder + '/' + filename)

    # NOTE(review): whether these two calls change the directory used by the
    # later commands depends on how runShell keeps state -- kept as they
    # were, confirm before removing.
    runShell('cd ' + shlex.quote(base_dir))
    runShell('cd ..')

    file_type = filename[-3:]
    # startswith() is safe for the default empty string, where indexing [0]
    # raised IndexError.
    if custom_words.startswith(','):
        custom_words = custom_words[1:]

    stem = filename[0:len(filename) - 4]
    relative_dir = 'upload_file/documents/' + user_folder + '/'

    if file_type == 'odt':
        # File name and custom words come from users: quote every value
        # that is spliced into a shell command.
        custom_words_option = (
            ' -w ' + shlex.quote(custom_words)) if custom_words != '' else ''
        file_error = False
        anonymized_document_name = stem + '_anonymized.odt'

        # Same comparison as before: only a value equal to False previews.
        if download == False:  # noqa: E712
            # Convert the odt to plain text and anonymize that for preview.
            tempname = 'temp_' + stem + '.txt'
            temp_file = os.path.join(
                base_dir, 'documents/' + user_folder + '/' + tempname)
            if not os.path.isfile(temp_file) or updateTextIfPossible:
                runShell('odt2txt ' + shlex.quote(file)
                         + ' --output=' + shlex.quote(temp_file))

            anonymized_file_name = (
                tempname[0:len(tempname) - 4] + '_anonymized.txt')
            anonymized_file = os.path.join(
                base_dir,
                'documents/' + user_folder + '/' + anonymized_file_name)
            if (not os.path.isfile(anonymized_file)
                    or updateTextIfPossible or rerender_text):
                runShell('python3 -m anonymizer_service'
                         + ' -i ' + shlex.quote(relative_dir + tempname)
                         + custom_words_option)

            with open(temp_file, mode='r') as f:
                text = f.read()
            with open(anonymized_file, mode='r') as f:
                text_anonymized = f.read()
        else:
            runShell('python3 -m anonymizer_service'
                     + ' -i ' + shlex.quote(file)
                     + ' -o '
                     + shlex.quote(relative_dir + anonymized_document_name)
                     + custom_words_option)
            return [{}, {}]
    elif file_type == 'txt':
        anonymized_file_name = (
            stem + '_anonymized' + filename[len(filename) - 4:len(filename)])
        anonymized_file = os.path.join(
            base_dir, 'documents/' + user_folder + '/' + anonymized_file_name)
        if (not os.path.isfile(anonymized_file)
                or updateTextIfPossible or rerender_text):
            # -w is always passed here, even when custom_words is empty.
            runShell('python3 -m anonymizer_service'
                     + ' -i ' + shlex.quote(file)
                     + ' -o '
                     + shlex.quote(relative_dir + anonymized_file_name)
                     + ' -w ' + shlex.quote(custom_words))

        with open(file, mode='r') as f:
            text = f.read()
        with open(anonymized_file, mode='r') as f:
            text_anonymized = f.read()
        file_error = False
        anonymized_document_name = anonymized_file_name
    else:
        text = 'This file can not be supported.'
        file_error = True
        anonymized_document_name = 'FAILED'
        text_anonymized = ''

    document = {
        'name': filename,
        'text': text,
        'type': file_type,
        'user_folder': user_folder,
        'files_folder': files_folder,
        'error': file_error
    }
    document_anonymized = {
        'name': anonymized_document_name,
        'text': text_anonymized,
        'type': file_type,
        'user_folder': user_folder,
        'files_folder': files_folder,
        'error': file_error
    }
    return [document, document_anonymized]
def _on_connect(self, conn, remote): """Connection with client has been established""" # Initialize connection error value and send a connection start response conn_error = False conn.send(self.RESPONSE_START) while True: self.input_error = False msg_queue = "" # Waits for client to send something... try: data = conn.recv(1024) except socket.error: conn_error = self.ERROR_CON_DIED break # No data received, terminate connection if not data: conn_error = self.ERROR_CON_LOST break # Splits data by space character. command = data[0], args = data[1:] data = str(data, "utf-8").strip().split(" ") command = data[0] if len(data) == 2: args = data[1] else: args = None if command == "GOTO": if args is None: self.input_error = self.ERROR_BAD_ARGS else: coords = self.parse_coords(args) pag.moveTo(coords[0], coords[1], self.MOUSE_MOVE_SPEED) \ if type(coords[0]) is float else self._set_input_error(self.ERROR_BAD_ARGS) elif command == "MOVE": if args is None: self.input_error = self.ERROR_BAD_ARGS else: coords = self.parse_coords(args) pag.moveRel(coords[0], coords[1], self.MOUSE_MOVE_SPEED) \ if type(coords[0]) is float else self._set_input_error(self.ERROR_BAD_ARGS) elif command == "CLICK": pag.click() elif command == "DOWN": pag.mouseDown() elif command == "UP": pag.mouseUp() elif command == "VDOWN": pag.press("volumedown") elif command == "VUP": pag.press("volumeup") elif command == "VMUTE": pag.press("volumemute") elif command == "EXIT": pag.hotkey("alt", "f4") elif command == "SLEEP": print("Terminating connection with " + remote[0] + "\nGoing to sleep...") conn.send(self.RESPONSE_CLOSE) conn.close() runShell( "C:\\Users\\johng\\PycharmProjects\\MouseServer\\PSTools\\psshutdown.exe -d -f -t 0" ) break elif command == "RCLICK": pag.click(button="right") elif command == "SEND": if args is None: self.input_error = self.ERROR_BAD_ARGS else: if args == "8": key = "backspace" else: key = chr(int(args)) pag.press(key) elif command == "HELP": msg_queue = """\ Mouse Server v0.1 
Available Commands: GOTO MOVE CLICK RCLICK SEND CLOSE HELP """ elif command == "CLOSE": print("Connection ended per client request") conn.send(self.RESPONSE_CLOSE) conn.close() break # Invalid command else: self.input_error = self.ERROR_BAD_COMMAND # Responds to client request with a 0 for OKAY and 1 for ERROR if self.input_error is not False: conn.send(self.RESPONSE_BAD + bytes(self._get_error(self.input_error), "UTF-8")) else: conn.send(self.RESPONSE_GOOD + bytes(msg_queue, "UTF-8")) # Connection ended, if there was an error report it if conn_error is not False: print("Input error: " + self._get_error(conn_error))