import random
import re
import sqlite3

import numpy as np

# GensimMaker is assumed to live in a local module of this project; adjust the
# import path to wherever GensimMaker is actually defined.
from gensim_maker import GensimMaker


class Preprocessing(object):

    # file location
    file_location = ''

    # the file lines
    lines = None

    # Location of gensim model that already has embedding matrix or location to save a new embedding matrix
    model_location = ''

    # gensim_model to be used
    gensim_model = None

    # Gensim_Maker object to be used
    gensim_maker_obj = None

    # the maximum number of words in any one company's text
    max_num_words = 0

    # the dictionary of a word and its one-hot encoding
    one_hot_dict = dict()

    def __init__(self, file_location, model_location, gensim_model=None, need_to_create_model=True):
        self.file_location = file_location
        self.model_location = model_location
        self.gensim_maker_obj = GensimMaker(file_location)
        if need_to_create_model:
            self.gensim_maker_obj.generate_model()
            self.gensim_maker_obj.save_model(model_location)
        else:
            self.gensim_maker_obj.load_model(model_location)
        if gensim_model is None:
            self.gensim_model = self.gensim_maker_obj.get_model()
        else:
            self.gensim_model = gensim_model

        with open(self.file_location) as file:
            self.lines = file.readlines()

        company_name_set = set()

        # get the maximum number of words in any line of the file and collect the company names
        max_length = 0
        for line in self.lines:
            line_split_list = line.split('\t')
            company_name = line_split_list[0]
            text_body = line_split_list[1]
            length = len(text_body.split())
            if length > max_length:
                max_length = length
            company_name_set.add(company_name)
        self.max_num_words = max_length
        self.one_hot_dict = self.create_one_hot_encoding_dictionary(company_name_set)
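
        # Note: each line of the input file is expected to be tab-separated,
        # e.g. (a hypothetical example): 'AcmeCorp\tbody of text about the company\n'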

    # Converts a line of the file to an array and pads it to the maximum text length
    def convert_line_of_file_to_array(self, line_number):
        num_dim_of_a_word = self.gensim_maker_obj.get_dimension_of_a_word()
        line = self.lines[line_number]
        text_body = line.split('\t')[1]
        big_array = np.zeros((num_dim_of_a_word, self.max_num_words))
        word_index = 0
        for word in text_body.split():
            try:
                new_word = self.gensim_model[word]
            # if a word isn't in the model then just leave it as a 0 vector
            except KeyError:
                word_index += 1
                continue
            big_array[:, word_index] = new_word
            word_index += 1
        return big_array.astype(np.float32)

    # creates the input matrix by making an array out of each requested line and stacking them along the first axis
    def create_input_matrix(self, list_of_line_numbers):
        matrix_to_return = np.empty((len(list_of_line_numbers), self.gensim_maker_obj.get_dimension_of_a_word(), self.max_num_words, 1))
        matrix_depth_index = 0
        for line_number in list_of_line_numbers:
            matrix_to_add = self.convert_line_of_file_to_array(line_number)
            matrix_to_return[matrix_depth_index, :, :, 0] = matrix_to_add
            matrix_depth_index += 1
        return matrix_to_return.astype(np.float32)

    # creates the label matrix by vertically stacking the one-hot vectors of the requested lines
    def create_answer_matrix(self, list_of_line_numbers):
        matrix_to_return = np.empty((len(list_of_line_numbers), len(self.one_hot_dict)))
        matrix_row_index = 0
        for line_number in list_of_line_numbers:
            line = self.lines[line_number]
            company_name = line.split('\t')[0]
            matrix_to_return[matrix_row_index, :] = self.one_hot_dict[company_name]
            matrix_row_index += 1
        return matrix_to_return.astype(np.float32)

    # creates the dictionary for the one hot vectors
    def create_one_hot_encoding_dictionary(self, list_of_words):
        vector_length = len(list_of_words)
        dictionary = dict()
        index = 0
        for name in list_of_words:
            vector = np.zeros((1, vector_length), dtype=np.float32)
            vector[0, index] = 1
            dictionary[name] = vector
            index += 1
        return dictionary
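
    # For example, create_one_hot_encoding_dictionary({'AcmeCorp', 'Globex'}) maps one
    # name to [[1., 0.]] and the other to [[0., 1.]] (set iteration order is arbitrary).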

    # returns the training, validation, and test sets as a list of numpy arrays in the following order:
    # training input, training labels, validation input, validation labels, test input, test labels
    # (inputs are 4d arrays, labels are 2d one-hot arrays).
    # input is a list of 3 numbers which are the ratios of the training, validation, and test set, in order
    def get_data_set(self, ratios_list):
        return self.get_data_set_given_line_numbers(self.get_random_line_numbers(ratios_list))

    # same as get_data_set, but takes a list of three lists of line numbers (train, validation, test) directly
    def get_data_set_given_line_numbers(self, list_of_line_numbers):

        train_set_lines_list = list_of_line_numbers[0]
        valid_set_lines_list = list_of_line_numbers[1]
        test_set_lines_list = list_of_line_numbers[2]

        return [self.create_input_matrix(train_set_lines_list), self.create_answer_matrix(train_set_lines_list),
                self.create_input_matrix(valid_set_lines_list), self.create_answer_matrix(valid_set_lines_list),
                self.create_input_matrix(test_set_lines_list), self.create_answer_matrix(test_set_lines_list)]

    # given ratios for training, validation, and test data, returns 3 lists of distinct random line numbers
    def get_random_line_numbers(self, ratios_list):
        num_lines_in_file = len(self.lines)
        num_lines_for_train_set = int(ratios_list[0] * num_lines_in_file)
        num_lines_for_valid_set = int(ratios_list[1] * num_lines_in_file)
        num_lines_for_test_set = num_lines_in_file - num_lines_for_train_set - num_lines_for_valid_set

        train_set_lines_list = list()
        valid_set_lines_list = list()
        test_set_lines_list = list()

        lines_list = list(range(num_lines_in_file))
        for count in range(num_lines_for_train_set):
            random_line = random.choice(lines_list)
            train_set_lines_list.append(random_line)
            lines_list.remove(random_line)

        for count in range(num_lines_for_valid_set):
            random_line = random.choice(lines_list)
            valid_set_lines_list.append(random_line)
            lines_list.remove(random_line)

        for count in range(num_lines_for_test_set):
            random_line = random.choice(lines_list)
            test_set_lines_list.append(random_line)
            lines_list.remove(random_line)

        return [train_set_lines_list, valid_set_lines_list, test_set_lines_list]

    # removes the numbers and the extension '.txt' from a string
    def remove_numbers_and_extension(self, word):
        output = word.replace('.txt', '')
        output = re.sub('[0-9]+', '', output)
        return output
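
# Example usage of Preprocessing (a minimal sketch; 'companies.tsv' and 'model.bin'
# are hypothetical paths):
#
#   preprocessor = Preprocessing('companies.tsv', 'model.bin', need_to_create_model=True)
#   train_x, train_y, valid_x, valid_y, test_x, test_y = preprocessor.get_data_set([0.7, 0.2, 0.1])
#   # train_x has shape (num_train_lines, word_dimension, max_num_words, 1)
#   # train_y has shape (num_train_lines, number_of_companies)
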
class SQLPreprocessor(object):

    # location of the database the data is read from
    db_name = ''

    # name of the table to retrieve patents
    table_for_retrieving = ''

    # table where new data is written
    table_for_storing_info = ''

    # Location of gensim model that already has embedding matrix or location to save a new embedding matrix
    model_location = ''

    # gensim_model to be used
    gensim_model = None

    # Gensim_Maker object to be used
    gensim_maker_obj = None

    # the maximum number of words in any one patent's text
    max_num_words = 0

    # dimension of any word vector in the model
    word_vector_dimension = 0

    def __init__(self, db_name, table_for_retrieving, model_location, table_for_storing_info, gensim_maker_obj=None,
                 need_to_make_table=False, gensim_model=None, need_to_create_models=True):
        self.db_name = db_name
        self.model_location = model_location
        # To protect against injection nonsense
        self.table_for_retrieving = self.remove_non_alphachars(table_for_retrieving)
        self.table_for_storing_info = self.remove_non_alphachars(table_for_storing_info)
        if gensim_maker_obj is None:
            self.gensim_maker_obj = GensimMaker(db_name, sql_table_name=table_for_retrieving, use_SQL_sentence_maker=True)
        else:
            self.gensim_maker_obj = gensim_maker_obj
        if need_to_create_models:
            self.gensim_maker_obj.generate_model()
            self.gensim_maker_obj.save_model(model_location)
        else:
            self.gensim_maker_obj.load_model(model_location)
        if gensim_model is None:
            self.gensim_model = self.gensim_maker_obj.get_model()
        else:
            self.gensim_model = gensim_model

        if need_to_make_table:
            conn = sqlite3.connect(self.db_name)

            conn.execute('''CREATE TABLE ''' + self.table_for_storing_info + ''' (PATENT_ID TEXT NOT NULL, FIELD_DATA TEXT NOT NULL);''')
            conn.close()
            self.generate_organized_db()

        self.word_vector_dimension = self.gensim_maker_obj.get_dimension_of_a_word()

        self.max_num_words = self.find_max_num_words()

    # gets the maximum number of words (or delimiter-separated segments) in any row of the organized table
    def find_max_num_words(self):
        max_length = 0
        conn = sqlite3.connect(self.db_name)
        all_patents = conn.execute("SELECT * FROM " + self.table_for_storing_info + ";")
        for line in all_patents:
            if '#!%$^&' in str(line[1]):
                curr_length = len(str(line[1]).split('#!%$^&'))
            else:
                curr_length = len(str(line[1]).split())
            if curr_length > max_length:
                max_length = curr_length
        conn.close()
        return max_length

    def remove_non_alphachars(self, string_to_clean):
        return ''.join(char for char in string_to_clean if char.isalnum() or char == '_')
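    # e.g. remove_non_alphachars('my_table; DROP') == 'my_tableDROP'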

    # Takes the db and makes a new one with patents organized by patent name in one column and field value in the other
    def generate_organized_db(self):

        conn = sqlite3.connect(self.db_name)
        all_patents = conn.execute("SELECT * FROM " + self.table_for_retrieving)

        line_to_write = ''
        current_patent_id = 'PLACEHOLDER'
        for line in all_patents:
            patent_id = str(line[0])
            patent_field_value = str(line[1].replace('\n', ''))
            if current_patent_id == 'PLACEHOLDER':
                current_patent_id = patent_id
                line_to_write += patent_field_value
                continue
            if patent_id != current_patent_id:
                params = (current_patent_id, line_to_write)
                conn.execute("INSERT INTO " + self.table_for_storing_info + " VALUES (?, ?);", params)
                current_patent_id = patent_id
                line_to_write = patent_field_value
            else:
                line_to_write += '#!%$^&' + patent_field_value
        # so last line is not ignored
        params = (current_patent_id, line_to_write)
        conn.execute(
                "INSERT INTO " + self.table_for_storing_info + " VALUES (?, ?);", params)
        conn.commit()
        conn.close()
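
    # For example, generate_organized_db turns source rows ('US123', 'claim text A') and
    # ('US123', 'claim text B') into one organized row ('US123', 'claim text A#!%$^&claim text B').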

    # Converts a row of the organized table to an array and pads it to the maximum text length in that table
    def convert_row_of_db_to_array(self, line_number):
        num_dim_of_a_word = self.gensim_maker_obj.get_dimension_of_a_word()
        conn = sqlite3.connect(self.db_name)
        params = (line_number,)
        query = conn.execute("SELECT * FROM " + self.table_for_storing_info + " LIMIT 1 OFFSET ?;", params)
        line = ''
        for row in query:
            line = str(row[1])
        conn.close()
        big_array = np.zeros((num_dim_of_a_word, self.max_num_words))
        # rows written by generate_organized_db join multiple field values with '#!%$^&'
        if '#!%$^&' in line:
            words = line.split('#!%$^&')
        else:
            words = line.replace('.', '').split()
        word_index = 0
        for word in words:
            try:
                new_word = self.gensim_model[word]
            # if a word isn't in the model then just leave it as a 0 vector
            except KeyError:
                word_index += 1
                continue
            big_array[:, word_index] = new_word
            word_index += 1
        return big_array.astype(np.float32)

    # creates the input matrix by making an array out of the text at that row in the generated db and stacks them on top of each other
    def create_input_matrix(self, list_of_line_numbers):
        matrix_to_return = np.empty((len(list_of_line_numbers), self.gensim_maker_obj.get_dimension_of_a_word(), self.max_num_words, 1))
        matrix_depth_index = 0
        for line_number in list_of_line_numbers:
            matrix_to_add = self.convert_row_of_db_to_array(line_number)
            matrix_to_return[matrix_depth_index, :, :, 0] = matrix_to_add
            matrix_depth_index += 1
        return matrix_to_return.astype(np.float32)

    # returns a list of training set, validation set, and test set as 4d numpy arrays as a list in the following order:
    # training input, validation input, test input.
    # input is a list of 3 numbers which are the ratios of the training, validation, and test set, in order
    def get_input_set(self, ratios_list):
        random_line_numbers_list = self.get_random_line_numbers(ratios_list)
        return self.get_input_set_given_line_numbers(random_line_numbers_list)

    def get_input_set_given_line_numbers(self, list_of_line_numbers):

        train_set_lines_list = list_of_line_numbers[0]
        valid_set_lines_list = list_of_line_numbers[1]
        test_set_lines_list = list_of_line_numbers[2]

        return [self.create_input_matrix(train_set_lines_list),
                self.create_input_matrix(valid_set_lines_list),
                self.create_input_matrix(test_set_lines_list)]

    # given ratios for training, validation, and test data, returns 3 lists each containing random line numbers from
    # the organized table assuming line 0 is the first row
    def get_random_line_numbers(self, ratios_list):
        conn = sqlite3.connect(self.db_name)
        num_lines_in_file = conn.execute("SELECT COUNT(*) FROM " + self.table_for_storing_info + ";").fetchone()[0]
        num_lines_for_train_set = int(ratios_list[0] * num_lines_in_file)
        num_lines_for_valid_set = int(ratios_list[1] * num_lines_in_file)
        num_lines_for_test_set = num_lines_in_file - num_lines_for_train_set - num_lines_for_valid_set

        train_set_lines_list = list()
        valid_set_lines_list = list()
        test_set_lines_list = list()

        lines_list = list(range(0, num_lines_in_file))
        for count in range(num_lines_for_train_set):
            random_line = random.choice(lines_list)
            train_set_lines_list.append(random_line)
            lines_list.remove(random_line)

        for count in range(num_lines_for_valid_set):
            random_line = random.choice(lines_list)
            valid_set_lines_list.append(random_line)
            lines_list.remove(random_line)

        for count in range(num_lines_for_test_set):
            random_line = random.choice(lines_list)
            test_set_lines_list.append(random_line)
            lines_list.remove(random_line)
        conn.close()

        return [train_set_lines_list, valid_set_lines_list, test_set_lines_list]

    # returns a list of patent ids given a list of line numbers in the organized patent file, assuming first row is
    # row 0
    def get_patent_ids_from_line_numbers(self, line_numbers_list):
        conn = sqlite3.connect(self.db_name)
        patent_id_list = list()

        for line in line_numbers_list:
            params = (line,)
            query = conn.execute(
                    "SELECT * FROM " + self.table_for_storing_info + " LIMIT 1 OFFSET ?;", params)
            for row in query:
                patent_id_list.append(str(row[0]))
        conn.close()
        return patent_id_list
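
# Example usage of SQLPreprocessor (a minimal sketch; 'patents.db', the table names,
# and 'model.bin' are hypothetical):
#
#   preprocessor = SQLPreprocessor('patents.db', 'raw_patents', 'model.bin',
#                                  'organized_patents', need_to_make_table=True)
#   train_x, valid_x, test_x = preprocessor.get_input_set([0.7, 0.2, 0.1])
#   # each array has shape (num_rows, word_vector_dimension, max_num_words, 1)
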
class SQLPreprocessorVersion2(object):

    # location of the database the data is read from
    db_name = ''

    # name of the table where data is kept
    table_name = ''

    # the name of the column where the data is stored
    column_name = ''

    # location of gensim model that already has embedding matrix or location to save a new embedding matrix
    model_location = ''

    # the delimiter used
    delimiter = ''

    # gensim_model to be used
    gensim_model = None

    # Gensim_Maker object to be used
    gensim_maker_obj = None

    # the maximum number of words in any one patent's text
    max_num_words = 0

    # dimension of any word vector in the model
    word_vector_dimension = 0

    def __init__(self, db_name, table_name, model_location, column_name, gensim_maker_obj=None, delimiter='',
                 gensim_model=None, need_to_create_models=True):
        self.db_name = db_name
        self.model_location = model_location
        # To protect against injection nonsense
        self.table_name = self.remove_non_alphachars(table_name)
        self.column_name = self.remove_non_alphachars(column_name)
        self.delimiter = delimiter
        if gensim_maker_obj is None:
            self.gensim_maker_obj = GensimMaker(db_name, sql_table_name=table_name, use_SQL_sentence_maker=True)
        else:
            self.gensim_maker_obj = gensim_maker_obj
        if need_to_create_models:
            self.gensim_maker_obj.generate_model()
            self.gensim_maker_obj.save_model(model_location)
        else:
            self.gensim_maker_obj.load_model(model_location)
        if gensim_model is None:
            self.gensim_model = self.gensim_maker_obj.get_model()
        else:
            self.gensim_model = gensim_model

        self.word_vector_dimension = self.gensim_maker_obj.get_dimension_of_a_word()

        self.max_num_words = self.find_max_num_words()

    # gets the maximum number of words (or delimiter-separated segments) in any row of the table
    def find_max_num_words(self):
        max_length = 0
        conn = sqlite3.connect(self.db_name)
        all_patents = conn.execute("SELECT " + self.column_name + " FROM " + self.table_name + ";")
        for line in all_patents:
            if self.delimiter and self.delimiter in str(line[0]):
                curr_length = len(str(line[0]).split(self.delimiter))
            else:
                curr_length = len(str(line[0]).split())
            if curr_length > max_length:
                max_length = curr_length
        conn.close()
        return max_length

    def remove_non_alphachars(self, string_to_clean):
        return ''.join(char for char in string_to_clean if char.isalnum() or char == '_')

    # Converts a patent's text to an array and pads it to the maximum text length of any patent's data
    def convert_patent_to_array(self, patent_id):
        num_dim_of_a_word = self.gensim_maker_obj.get_dimension_of_a_word()
        conn = sqlite3.connect(self.db_name)
        # bind patent_id as a parameter so it is also protected against injection
        query = conn.execute("SELECT " + self.column_name + " FROM " + self.table_name +
                             " WHERE patent_id = ?;", (patent_id,))
        line = ''
        for row in query:
            line = str(row[0])
        conn.close()
        big_array = np.zeros((num_dim_of_a_word, self.max_num_words))
        # guard against the default empty delimiter: str.split('') raises a ValueError
        if self.delimiter and self.delimiter in line:
            words = line.split(self.delimiter)
        else:
            words = line.replace('.', '').split()
        word_index = 0
        for word in words:
            try:
                new_word = self.gensim_model[word]
            # if a word isn't in the model then just leave it as a 0 vector
            except KeyError:
                word_index += 1
                continue
            big_array[:, word_index] = new_word
            word_index += 1
        return big_array.astype(np.float32)

    # creates the input matrix by making an array out of the text at that row in the generated db and stacks them on top of each other
    def create_input_matrix(self, list_of_patent_ids):
        matrix_to_return = np.empty((len(list_of_patent_ids), self.gensim_maker_obj.get_dimension_of_a_word(), self.max_num_words, 1))
        matrix_depth_index = 0
        for patent in list_of_patent_ids:
            matrix_to_add = self.convert_patent_to_array(patent)
            matrix_to_return[matrix_depth_index, :, :, 0] = matrix_to_add
            matrix_depth_index += 1
        return matrix_to_return.astype(np.float32)

    # returns a list of training set, validation set, and test set as 4d numpy arrays as a list in the following order:
    # training input, validation input, test input.
    # input is a list of 3 numbers which are the ratios of the training, validation, and test set, in order
    def get_input_set(self, ratios_list):
        random_patents_lists = self.get_random_patent_ids(ratios_list)
        return self.get_input_set_given_patent_ids(random_patents_lists)

    def get_input_set_given_patent_ids(self, list_of_patent_ids):

        train_set_patent_list = list_of_patent_ids[0]
        valid_set_patent_list = list_of_patent_ids[1]
        test_set_patent_list = list_of_patent_ids[2]

        return [self.create_input_matrix(train_set_patent_list),
                self.create_input_matrix(valid_set_patent_list),
                self.create_input_matrix(test_set_patent_list)]

    # given ratios for training, validation, and test data, returns 3 lists each containing random patent ids
    def get_random_patent_ids(self, ratios_list):
        conn = sqlite3.connect(self.db_name)
        query = conn.execute("SELECT * FROM " + self.table_name + ";")
        patent_id_list = list()
        for row in query:
            patent_id_list.append(str(row[0]))

        num_patents_in_file = len(patent_id_list)

        num_patents_for_train_set = int(ratios_list[0] * num_patents_in_file)
        num_patents_for_valid_set = int(ratios_list[1] * num_patents_in_file)
        # compare the ratio sum with a tolerance: floats such as 0.7 + 0.2 + 0.1 do not sum to exactly 1
        if abs(ratios_list[0] + ratios_list[1] + ratios_list[2] - 1) < 1e-9:
            num_patents_for_test_set = num_patents_in_file - num_patents_for_train_set - num_patents_for_valid_set
        else:
            num_patents_for_test_set = int(ratios_list[2] * num_patents_in_file)

        train_set_patents_list = list()
        valid_set_patents_list = list()
        test_set_patents_list = list()

        for count in range(num_patents_for_train_set):
            random_patent = random.choice(patent_id_list)
            train_set_patents_list.append(random_patent)
            patent_id_list.remove(random_patent)

        for count in range(num_patents_for_valid_set):
            random_patent = random.choice(patent_id_list)
            valid_set_patents_list.append(random_patent)
            patent_id_list.remove(random_patent)

        for count in range(num_patents_for_test_set):
            random_patent = random.choice(patent_id_list)
            test_set_patents_list.append(random_patent)
            patent_id_list.remove(random_patent)

        conn.close()

        return [train_set_patents_list, valid_set_patents_list, test_set_patents_list]
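
# Example usage of SQLPreprocessorVersion2 (a minimal sketch; the database, table,
# column, model path, and delimiter are hypothetical):
#
#   preprocessor = SQLPreprocessorVersion2('patents.db', 'patent_data', 'model.bin',
#                                          'field_data', delimiter='#!%$^&')
#   train_x, valid_x, test_x = preprocessor.get_input_set([0.7, 0.2, 0.1])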