Esempio n. 1
0
def _flatten_whitespace(data):
    """Replace every newline, tab and carriage return in *data* with a space.

    Returns ``[flattened, replaced]`` where ``replaced`` is a list of
    ``[index, character]`` pairs recording which character originally sat
    at each replaced position, in ascending index order.
    """
    replaced = [[i, letter] for i, letter in enumerate(data)
                if letter in ('\n', '\t', '\r')]
    # One pass over the text instead of three chained str.replace() calls.
    flattened = data.translate(str.maketrans('\n\t\r', '   '))
    return [flattened, replaced]


def read_data_from_file(ifile, format='txt'):
    """Read *ifile* and return its text with line breaks and tabs flattened.

    Args:
        ifile: Path of the file to read.
        format: ``'txt'`` reads the file directly. Any other value takes the
            ODF route: the file is converted with the external ``odf2xml``
            command into ``<ifile minus its last 4 characters>_temp.xml``,
            that XML is read as UTF-8 and the temporary file is deleted.

    Returns:
        ``[data, replaced]``: ``data`` is the text with each ``\\n``, ``\\t``
        and ``\\r`` replaced by a single space; ``replaced`` is a list of
        ``[index, character]`` pairs describing the replaced characters.

    Raises:
        SystemExit: in ``'txt'`` mode when *ifile* does not exist (the
            ``FileNotFoundError`` is passed to ``exit``).
    """
    # Function-scope imports keep this block self-contained.
    import os
    import shlex

    if format == 'txt':
        try:
            with open(ifile, 'r') as f:
                data = f.read()
        except FileNotFoundError as fnf_error:
            exit(fnf_error)
        return _flatten_whitespace(data)

    temp_path = ifile[0:len(ifile) - 4] + '_temp.xml'
    # Quote both paths so spaces or shell metacharacters in a file name
    # cannot split or alter the command.
    runShell('odf2xml -o ' + shlex.quote(temp_path) + ' ' +
             shlex.quote(ifile))
    try:
        with open(temp_path, mode='r', encoding='utf-8') as f:
            data = f.read()
    finally:
        # Always clean up the temporary XML, even when reading fails, and do
        # it without spawning `rm`.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            # The conversion produced no file, so there is nothing to delete.
            pass
    return _flatten_whitespace(data)
Esempio n. 2
0
def clear_documents_on_redirect(request):
    """Delete everything inside ``../upload_file/documents/``.

    The directory is resolved relative to the directory of this module and
    emptied with a shell ``rm -r`` issued through ``runShell``.

    Args:
        request: Not used by this function.
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    module_dir = os.path.dirname(__file__)
    documents_dir = os.path.join(module_dir, '../upload_file/documents/')
    # `&&` instead of `;`: if `cd` fails, `rm -r *` must NOT run in whatever
    # directory the shell happens to be in. The path is quoted, and `--`
    # stops file names that start with '-' from being parsed as rm options.
    command = 'cd ' + shlex.quote(documents_dir) + ' && rm -r -- *'

    runShell(command=command)
Esempio n. 3
0
def find_entities(ifile,
                  ofile,
                  method='delete',
                  patterns_file='patterns.json',
                  verbose=False,
                  words_array=[],
                  exclude_array=[]):
    """Find entities in a .txt/.odt file and write an anonymized copy.

    The input is loaded through ``read_data_from_file`` (line breaks and tabs
    flattened to spaces), entities are collected from the matchers enabled in
    the patterns file plus the custom ``words_array``, and every remaining
    entity span is replaced by ``anonymize_element(...)`` while the text in
    between is copied unchanged. The flattened whitespace characters are then
    written back at their recorded indices and the result is saved to
    ``ofile``; for ``.odt`` input ``xml2odf`` is run on ``ofile`` afterwards.

    Args:
        ifile: Input path; its last three characters must be ``odt`` or
            ``txt``.
        ofile: Output path that receives the anonymized text.
        method: Passed through to ``anonymize_element``. It is also unpacked
            into three items when ``words_array`` is non-empty.
            NOTE(review): the default ``'delete'`` has six characters, so
            that unpacking raises ValueError - callers presumably pass a
            3-item sequence; confirm.
        patterns_file: Path handed to ``read_patterns``.
        verbose: When truthy, print every entity found (colored).
        words_array: Extra words to search for in the text.
        exclude_array: Passed to ``Excluder.exclude`` together with the
            entities.

    Raises:
        NameError: if ``ifile`` does not end in ``odt`` or ``txt``.

    NOTE(review): ``words_array`` and ``exclude_array`` are mutable default
    arguments shared between calls; this function does not mutate them
    itself, but ``Excluder.exclude`` is not visible here.
    """

    in_order = True
    # spacy -- init
    #

    # nlp = spacy.load('el_core_news_sm')
    # matcher = Matcher(nlp.vocab)

    # Check file extension (last three characters of the path)
    extension = ifile[-3:]
    if extension == 'odt':
        [data, replaced] = read_data_from_file(ifile=ifile, format='odt')
    elif extension == 'txt':
        [data, replaced] = read_data_from_file(ifile=ifile, format='txt')
    else:
        raise NameError('find_entities: Not extension .txt or .odt')
    # doc = nlp(data)
    # data = str(doc)

    # READ CONFIGURATION FILE
    #
    patterns_json = read_patterns(patterns_file)
    '''
        --- ENTITY LIST EXPLANATION ---
        entities = [entity_name, entity_value,
            span/word, start, end, found_by_spacy]

        We will use found_by_spacy bool to access data either via
        doc[start:end] if True else str(doc)[start:end] .

        Span/word is the word just the way it was found into the text
        while entity_value is the value extracted through specific
        algorithms each time.

        Some times these to might have the same value.
    '''
    entities = []

    # Run every enabled matcher from the patterns file. The matcher key is
    # the name of a function looked up on `matcher_patterns`.
    for matcher, value in patterns_json['matcher'].items():
        # NOTE(review): 'active' is compared with the *string* 'False'; a
        # boolean False value would not be skipped - confirm the file format.
        if value['active'] == 'False':
            continue
        custom_pattern_method = getattr(matcher_patterns, matcher)
        # Call function with the proper parameters
        results = custom_pattern_method(data=data,
                                        pattern=fix_pattern(value['pattern']))
        if results != None:
            entities += results

    # Words Array , Custom word search in text
    for word in words_array:
        # `method` must be a 3-item sequence here (see docstring note).
        [method_type, symbol, length_replace] = method
        # Only the first character of the symbol is used.
        if len(symbol) > 1:
            symbol = symbol[0]
        if symbol in word:
            # The word itself contains the anonymization symbol, e.g.
            # something like '9 **** 2019' (semi-anonymized).
            splitted_word = word.split(symbol)
            pattern = r''
            for item in splitted_word:
                # Each symbol occurrence becomes '.' (presumably a regex
                # "any character" - depends on matcher_patterns.custom_words)
                pattern += item + '.'
            # Drop the extra '.' appended after the last piece
            pattern = pattern[0:len(pattern) - 1]

        else:
            pattern = word

        results = matcher_patterns.custom_words(data=data, word=pattern)
        if results != None:
            entities += results

    # Hard-coded switch: always sort entities with the `sort_by_start` key.
    if in_order == True:
        entities.sort(key=sort_by_start)

    # Hard-coded switch: always drop duplicate entities, keeping the first
    # occurrence and the current order.
    unique_values = True
    if unique_values == True:
        final_entities = []
        for entity in entities:
            if entity not in final_entities:
                final_entities.append(entity)
        entities = final_entities

    if verbose:
        # Display name, value, span, start and end of each entity
        print(colored(f'\n\n-------------File:{ifile}-------------', 'green'))
        for element in entities:
            print(
                '[',
                colored(element[0], 'yellow'),
                ',',
                colored(element[1], 'blue'),
                ',',
                colored(element[2], 'cyan'),
                ',',
                element[3],
                ',',
                element[4],
                ']',
            )

    # Anonymize entities by replacing them in a copy of the original text.
    # `index` is the position in `data` up to which text has already been
    # emitted; `previous_e` is the end offset of the last handled entity.

    final_text = ''
    index = 0
    previous_e = 0

    entities = Excluder.exclude(entities, exclude_array)
    for element in entities:
        span = element[2]
        s = element[3]
        e = element[4]
        # Brand elements may have html tags in .odt files
        if element[0] == 'brand_name' and ifile[-3:] == 'odt':
            # Copy the untouched text before the entity.
            final_text += data[index:s]
            index = s
            previous_e = e
            import re
            # Collect every '<...>' tag in the span as [start, end, text];
            # the offsets are relative to the span, not to `data`.
            regex = r'(<.*?>)'
            tags = []
            for match in re.finditer(regex, span):
                st = match.start()
                end = match.end()
                sp = span[st:end]
                tags.append([st, end, sp])
            if tags == []:
                # No tags: anonymize the element as a whole. `index` already
                # equals `s`, so data[index:s] is empty here.
                final_text += data[index:s] + \
                    anonymize_element(element, method)
                previous_e = e
                index = e
                continue
            # Anonymize the text pieces between tags and copy the tags
            # themselves through unchanged. Span-relative offsets are shifted
            # by `index` (== s); assumes span == data[s:e] - TODO confirm.
            temp_index = index
            for tag in tags:
                st = tag[0]
                end = tag[1]
                sp = tag[2]
                temp_data = data[temp_index:index + st]
                temp_element = [
                    'brand_name',
                    temp_data.upper(), temp_data, temp_index, index + st, False
                ]

                final_text += anonymize_element(temp_element, method)
                final_text += sp
                temp_index = index + end
            # Remaining text after the last tag, up to the end of the entity.
            temp_data = data[temp_index:e]
            temp_element = [
                'brand_name',
                temp_data.upper(), temp_data, temp_index, e, False
            ]

            final_text += anonymize_element(temp_element, method)
            # index = previous_e
            # index = e
            index = e
            continue
        if previous_e >= e:
            # Current element ends at or before the end of the previous one,
            # so it is already covered: emit nothing for it.
            index = previous_e
            continue
        elif (s >= previous_e):
            # Common case: no overlap with the previous element.
            final_text += data[index:s] + anonymize_element(element, method)
            previous_e = e
            index = e
        else:
            # Previous and current element overlap partially: anonymize only
            # the part of the current element after the previous one's end.
            # f'Weird case span_trimmed:{data[previous_e:e]},span:{span}')
            temp_element = [
                element[0], element[1], data[previous_e:e], previous_e, e,
                element[5]
            ]
            final_text += anonymize_element(temp_element, method)
            previous_e = e
            index = e
    # Copy whatever text follows the last entity.
    if index < len(data):
        final_text += data[index:len(data)]

    # Put the original newline/tab/carriage-return characters back. Each
    # recorded index overwrites exactly one character of final_text, so this
    # lines up only if anonymization kept character positions unchanged -
    # TODO confirm that anonymize_element preserves length.
    for i, letter in replaced:
        final_text = final_text[:i] + letter + final_text[i + 1:]

    with open(ofile, mode='w') as of:
        of.write(final_text)

    if ifile[-3:] == 'odt':
        # Create an odt file: xml2odf is given `ofile` both as the output
        # (-o) and as the input.
        command = 'xml2odf -o ' + ofile + ' ' + ofile
        runShell(command)
def anonymize_file(id='',
                   user_folder='default',
                   files_folder='files',
                   custom_words='',
                   text='',
                   download=False,
                   updateTextIfPossible=False,
                   rerender_text=True):
    """Anonymize an uploaded document and return original and result.

    The ``Document`` with primary key *id* is looked up; ``str(document)`` is
    used as the file name, expected under
    ``documents/<user_folder>/<files_folder>/`` next to this module. The
    anonymization itself is delegated to ``python3 -m anonymizer_service``
    through ``runShell``.

    * ``.odt`` with ``download == False``: the document is converted to
      ``temp_<name>.txt`` with ``odt2txt`` (only if that file is missing or
      *updateTextIfPossible* is set), the text file is anonymized and both
      texts are returned as a preview.
    * ``.odt`` otherwise: an anonymized ``.odt`` is produced and
      ``[{}, {}]`` is returned.
    * ``.txt``: the file is anonymized into ``<name>_anonymized.txt`` and
      both texts are returned.
    * anything else: an error document is returned.

    Args:
        id: Primary key passed to ``Document.objects.get``.
        user_folder: Sub-folder of ``documents/`` that belongs to the user.
        files_folder: Sub-folder of *user_folder* holding the upload.
        custom_words: Comma-separated extra words handed to the service with
            ``-w``; one leading comma is stripped.
        text: Ignored; every branch overwrites it.
        download: See the branch description above.
        updateTextIfPossible: Force regeneration of the cached files.
        rerender_text: Force re-running the anonymizer even when its output
            file already exists.

    Returns:
        ``[document, document_anonymized]``, two dicts with the keys
        ``name``, ``text``, ``type``, ``user_folder``, ``files_folder`` and
        ``error``; or ``[{}, {}]`` for the ``.odt`` download case.
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    obj_file = Document.objects.get(id=id)
    filename = str(obj_file)
    module_dir = os.path.dirname(__file__)
    user_dir = 'documents/' + user_folder + '/'
    # Path prefix the anonymizer service is given for files of this user.
    service_dir = 'upload_file/documents/' + user_folder + '/'
    file = os.path.join(module_dir, user_dir + files_folder + '/' + filename)

    # NOTE(review): if runShell starts a fresh shell per call, these two `cd`
    # commands do not affect the later commands - kept as in the original.
    runShell('cd ' + shlex.quote(module_dir))
    runShell('cd ..')

    file_type = filename[-3:]
    # startswith() instead of custom_words[0]: the default '' must not raise
    # IndexError.
    if custom_words.startswith(','):
        custom_words = custom_words[1:]

    name_length = len(filename)
    # File name without its last four characters (the dot and extension).
    stem = filename[0:name_length - 4]

    if file_type == 'odt':
        file_error = False
        anonymized_document_name = stem + '_anonymized.odt'
        # shlex.quote keeps quotes/metacharacters in the user-supplied words
        # from breaking out of the command line.
        custom_words_option = ((' -w ' + shlex.quote(custom_words))
                               if custom_words != '' else '')

        # Loose equality kept on purpose: 0 counts as False, None does not.
        if download == False:  # noqa: E712
            # Preview: work on a plain-text rendering of the document.
            tempname = 'temp_' + stem + '.txt'
            temp_file = os.path.join(module_dir, user_dir + tempname)

            # Convert only when the text file is missing or an update is
            # forced.
            if not os.path.isfile(temp_file) or updateTextIfPossible:
                runShell('odt2txt ' + shlex.quote(file) + ' --output=' +
                         shlex.quote(temp_file))

            anonymized_file_name = (tempname[0:len(tempname) - 4] +
                                    '_anonymized.txt')
            anonymized_file = os.path.join(module_dir,
                                           user_dir + anonymized_file_name)

            # No -o is passed here; the code below expects the service to
            # write `anonymized_file`.
            if (not os.path.isfile(anonymized_file) or updateTextIfPossible
                    or rerender_text):
                runShell('python3 -m anonymizer_service -i ' +
                         shlex.quote(service_dir + tempname) +
                         custom_words_option)

            with open(temp_file, mode='r') as f:
                text = f.read()
            with open(anonymized_file, mode='r') as f:
                text_anonymized = f.read()
        else:
            # Download: produce the anonymized .odt; there is no preview.
            runShell('python3 -m anonymizer_service -i ' +
                     shlex.quote(file) + ' -o ' +
                     shlex.quote(service_dir + anonymized_document_name) +
                     custom_words_option)
            return [{}, {}]

    elif file_type == 'txt':
        anonymized_file_name = (stem + '_anonymized' +
                                filename[name_length - 4:name_length])
        anonymized_file = os.path.join(module_dir,
                                       user_dir + anonymized_file_name)

        # Check if file exists already or force update
        if (not os.path.isfile(anonymized_file) or updateTextIfPossible
                or rerender_text):
            # -w is always passed in this branch, even for an empty list.
            runShell('python3 -m anonymizer_service -i ' +
                     shlex.quote(file) + ' -o ' +
                     shlex.quote(service_dir + anonymized_file_name) +
                     ' -w ' + shlex.quote(custom_words))

        with open(file, mode='r') as f:
            text = f.read()
        with open(anonymized_file, mode='r') as f:
            text_anonymized = f.read()
        file_error = False
        anonymized_document_name = anonymized_file_name
    else:
        text = 'This file can not be supported.'
        file_error = True
        anonymized_document_name = 'FAILED'
        text_anonymized = ''

    document = {
        'name': filename,
        'text': text,
        'type': file_type,
        'user_folder': user_folder,
        'files_folder': files_folder,
        'error': file_error
    }

    document_anonymized = {
        'name': anonymized_document_name,
        'text': text_anonymized,
        'type': file_type,
        'user_folder': user_folder,
        'files_folder': files_folder,
        'error': file_error
    }

    return [document, document_anonymized]
Esempio n. 5
0
    def _on_connect(self, conn, remote):
        """Connection with client has been established"""

        # Initialize connection error value and send a connection start response
        conn_error = False
        conn.send(self.RESPONSE_START)

        while True:
            self.input_error = False
            msg_queue = ""

            # Waits for client to send something...
            try:
                data = conn.recv(1024)
            except socket.error:
                conn_error = self.ERROR_CON_DIED
                break

            # No data received, terminate connection
            if not data:
                conn_error = self.ERROR_CON_LOST
                break

            # Splits data by space character. command = data[0], args = data[1:]
            data = str(data, "utf-8").strip().split(" ")
            command = data[0]

            if len(data) == 2:
                args = data[1]
            else:
                args = None

            if command == "GOTO":
                if args is None:
                    self.input_error = self.ERROR_BAD_ARGS
                else:
                    coords = self.parse_coords(args)
                    pag.moveTo(coords[0], coords[1], self.MOUSE_MOVE_SPEED) \
                        if type(coords[0]) is float else self._set_input_error(self.ERROR_BAD_ARGS)

            elif command == "MOVE":
                if args is None:
                    self.input_error = self.ERROR_BAD_ARGS
                else:
                    coords = self.parse_coords(args)
                    pag.moveRel(coords[0], coords[1], self.MOUSE_MOVE_SPEED) \
                        if type(coords[0]) is float else self._set_input_error(self.ERROR_BAD_ARGS)

            elif command == "CLICK":
                pag.click()

            elif command == "DOWN":
                pag.mouseDown()

            elif command == "UP":
                pag.mouseUp()

            elif command == "VDOWN":
                pag.press("volumedown")

            elif command == "VUP":
                pag.press("volumeup")

            elif command == "VMUTE":
                pag.press("volumemute")

            elif command == "EXIT":
                pag.hotkey("alt", "f4")

            elif command == "SLEEP":
                print("Terminating connection with " + remote[0] +
                      "\nGoing to sleep...")
                conn.send(self.RESPONSE_CLOSE)
                conn.close()
                runShell(
                    "C:\\Users\\johng\\PycharmProjects\\MouseServer\\PSTools\\psshutdown.exe -d -f -t 0"
                )
                break

            elif command == "RCLICK":
                pag.click(button="right")

            elif command == "SEND":
                if args is None:
                    self.input_error = self.ERROR_BAD_ARGS
                else:
                    if args == "8":
                        key = "backspace"
                    else:
                        key = chr(int(args))
                    pag.press(key)

            elif command == "HELP":
                msg_queue = """\
Mouse Server v0.1
Available Commands:
GOTO
MOVE
CLICK
RCLICK
SEND
CLOSE
HELP
"""

            elif command == "CLOSE":
                print("Connection ended per client request")
                conn.send(self.RESPONSE_CLOSE)
                conn.close()
                break

            # Invalid command
            else:
                self.input_error = self.ERROR_BAD_COMMAND

            # Responds to client request with a 0 for OKAY and 1 for ERROR
            if self.input_error is not False:
                conn.send(self.RESPONSE_BAD +
                          bytes(self._get_error(self.input_error), "UTF-8"))
            else:
                conn.send(self.RESPONSE_GOOD + bytes(msg_queue, "UTF-8"))

        # Connection ended, if there was an error report it
        if conn_error is not False:
            print("Input error: " + self._get_error(conn_error))