Example #1
 def get_cost(self, word_1: str, word_2: str):
     return osa(word_1.lower(), word_2.lower(),
                transpose_costs=self.transpose_costs,
                substitute_costs=self.substitute_costs,
                insert_costs=self.insert_costs,
                delete_costs=self.delete_costs,
                )
Example #2
 def get_cost(self, word_1: str, word_2: str):
     return osa(
         word_1.lower(),
         word_2.lower(),
         transpose_costs=self.transpose_costs,
         substitute_costs=self.substitute_costs,
         insert_costs=self.insert_costs,
         delete_costs=self.delete_costs,
     )
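Both get_cost variants above assume that the surrounding class has already built per-character cost arrays. The sketch below shows what those arrays might look like, assuming the weighted_levenshtein package, whose osa accepts NumPy arrays indexed by ASCII code: shape (128,) for insert/delete costs and (128, 128) for substitute/transpose costs. The '0'-to-'o' discount is purely illustrative.

import numpy as np
from weighted_levenshtein import osa

insert_costs = np.ones(128, dtype=np.float64)       # cost of inserting each ASCII char
delete_costs = np.ones(128, dtype=np.float64)       # cost of deleting each ASCII char
substitute_costs = np.ones((128, 128), dtype=np.float64)
transpose_costs = np.ones((128, 128), dtype=np.float64)

# Illustrative tweak: substituting '0' with 'o' (a common OCR confusion) is cheap.
substitute_costs[ord('0'), ord('o')] = 0.1

print(osa('f00d', 'food',
          insert_costs=insert_costs,
          delete_costs=delete_costs,
          substitute_costs=substitute_costs,
          transpose_costs=transpose_costs))  # 0.2: two discounted substitutions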
Example #3
 def test_osa(self):
     self.assertEqual(osa('1234', '1234'), 0.0)
     self.assertEqual(osa('', '1234'), 4.0)
     self.assertEqual(osa('1234', ''), 4.0)
     self.assertEqual(osa('', ''), 0.0)
     self.assertEqual(osa('1234', '12'), 2.0)
     self.assertEqual(osa('1234', '14'), 2.0)
     self.assertEqual(osa('1111', '1'), 3.0)
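This test passes no cost arrays, so every edit operation costs 1.0 and osa reduces to the classic optimal string alignment (OSA) distance. A few more unit-cost checks, as a sketch assuming the weighted_levenshtein implementation:

from weighted_levenshtein import osa

assert osa('1234', '1243') == 1.0  # one adjacent transposition
assert osa('1234', '2134') == 1.0  # likewise, at the front
assert osa('abc', 'ca') == 3.0     # OSA, unlike full Damerau-Levenshtein, never edits a transposed pair again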
Example #4
import json

import numpy as np
from weighted_levenshtein import osa


def find_closest_string_weighted(string):
    print("string: " + string)

    with open('../approaches/edit_distance/cleaned_bucket_data.json',
              encoding="ASCII") as f:
        data = json.load(f)

    # find first letter of every word in the string
    words = string.split()
    letters = [word[0] for word in words]

    # get corresponding buckets
    first_letter = string[0]
    products = []
    for bucket in data:
        if bucket[0][0].lower() == first_letter.lower():
            products += bucket

    # remove non-ascii characters
    cleaned_products = []
    for entry in products:
        cleaned_entry = ""
        for character in entry:
            if ord(character) < 128:
                cleaned_entry += character
        cleaned_products.append(cleaned_entry)

    # insertions are cheap: cost 0.3 for each of the 128 ASCII characters
    insert_costs = np.full(128, .3, dtype=np.float64)
    transpose_costs = np.full((128, 128), .7, dtype=np.float64)
    delete_costs = np.full(128, 1.2, dtype=np.float64)

    closest_distance = float('inf')
    closest_string = None

    for line in cleaned_products:
        distance = osa(string.lower(),
                       line.lower(),
                       insert_costs=insert_costs,
                       transpose_costs=transpose_costs,
                       delete_costs=delete_costs)
        if distance < closest_distance:
            closest_distance = distance
            closest_string = line.lower()
    print("closest_string: " + closest_string)
    return closest_string
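The function presupposes that cleaned_bucket_data.json contains a list of buckets, each bucket being a list of product names that share a first letter. A hypothetical call, with the assumed file layout sketched in the comment:

# Assumed bucket layout in cleaned_bucket_data.json:
#   [["apple juice", "apple pie"], ["banana", "bagel"], ...]
best = find_closest_string_weighted("aple juice")  # would print and return e.g. "apple juice"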
Example #5
import json
import pickle

import numpy as np
from flask import render_template
from keras import backend as K              # or tensorflow.keras, depending on setup
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from weighted_levenshtein import osa

# get_dataframe is a project-local feature-extraction helper (not shown).


def predict(searched=None,
            internal_call=False,
            receipt=None,
            receipt_titles=None):
    '''
    important variables:
    tokenizer - tokenizer for tokenizing text
    lstm_model - trained lstm model
    le - label encoder to decode output
    '''

    # lstm
    K.clear_session()
    lstm_path = "../approaches/LSTM/"
    with open(lstm_path + "pickled/tokenizer_300k_1epoch.pickle",
              'rb') as handle:
        tokenizer = pickle.load(handle)
    lstm_model = load_model(lstm_path + "models/lstm_300k_epochs_1.h5")
    le = preprocessing.LabelEncoder()
    le.classes_ = np.load(lstm_path +
                          "pickled/labelencoder_classes_300k_1epoch.npy")
    tokenizer.oov_token = None

    # edit distance
    with open('../approaches/edit_distance/cleaned_bucket_data.json',
              encoding="ASCII") as f:
        data = json.load(f)

    # gbdt
    gbdt_path = "../approaches/gbdt/"
    gbdt_model = pickle.load(open(gbdt_path + "models/gbdt_model.sav", "rb"))
    gbdt_le = preprocessing.LabelEncoder()
    gbdt_le.classes_ = np.load(gbdt_path +
                               "pickled/labelencoder_gbdt_classes.npy")

    ## if internal call ##
    if internal_call:
        # lstm
        encoded_x = tokenizer.texts_to_sequences([searched])
        padded = pad_sequences(encoded_x, 25)
        lstm_preds = lstm_model.predict(padded)
        pred_labels = [[np.argmax(x)] for x in lstm_preds]
        lstm_preds = le.inverse_transform(pred_labels)

        # gbdt
        df = get_dataframe([searched.upper()])
        gbdt_preds = gbdt_model.predict(df.drop(columns=['x', 'y'], axis=1))
        gbdt_preds = gbdt_le.inverse_transform(gbdt_preds)[0]

        #=========================================================

        # edit distance

        # find first letter of every word in the string
        words = searched.split()
        letters = [word[0] for word in words]

        # get corresponding buckets
        first_letter = searched[0]
        products = []
        for bucket in data:
            if bucket[0][0].lower() == first_letter.lower():
                products += bucket

        # remove non-ascii characters
        cleaned_products = []
        for entry in products:
            cleaned_entry = ""
            for character in entry:
                if ord(character) < 128:
                    cleaned_entry += character
            cleaned_products.append(cleaned_entry)

        # insertions are cheap: cost 0.3 for each of the 128 ASCII characters
        insert_costs = np.full(128, .3, dtype=np.float64)
        transpose_costs = np.full((128, 128), .7, dtype=np.float64)
        delete_costs = np.full(128, 1.2, dtype=np.float64)

        closest_distance = float('inf')
        closest_string = None

        for line in cleaned_products:
            distance = osa(searched.lower(),
                           line.lower(),
                           insert_costs=insert_costs,
                           transpose_costs=transpose_costs,
                           delete_costs=delete_costs)
            if distance < closest_distance:
                closest_distance = distance
                closest_string = line.lower()

        edit_distance_pred = closest_string
        return (lstm_preds, edit_distance_pred, gbdt_preds)
    else:
        print('receipt titles = ', receipt_titles)
        encoded_x = tokenizer.texts_to_sequences(receipt_titles)
        padded = pad_sequences(encoded_x, 25)
        lstm_preds = lstm_model.predict(padded)
        pred_labels = [[np.argmax(x)] for x in lstm_preds]
        lstm_preds = le.inverse_transform(pred_labels)

        # gbdt
        titles = []
        for title in receipt_titles:
            titles.append(title.upper())
        df = get_dataframe(titles)
        gbdt_preds = gbdt_model.predict(df.drop(columns=['x', 'y'], axis=1))
        gbdt_preds = gbdt_le.inverse_transform(gbdt_preds)

        #=========================================================

        # edit distance
        edit_distance_preds = []
        for title in receipt_titles:

            # find first letter of every word in the string
            words = title.split()
            letters = [word[0] for word in words]

            # get corresponding buckets
            first_letter = title[0]
            products = []
            for bucket in data:
                if bucket[0][0].lower() == first_letter.lower():
                    products += bucket

            # remove non-ascii characters
            cleaned_products = []
            for entry in products:
                cleaned_entry = ""
                for character in entry:
                    if ord(character) < 128:
                        cleaned_entry += character
                cleaned_products.append(cleaned_entry)

            # insertions are cheap: cost 0.3 for each of the 128 ASCII characters
            insert_costs = np.full(128, .3, dtype=np.float64)
            transpose_costs = np.full((128, 128), .7, dtype=np.float64)
            delete_costs = np.full(128, 1.2, dtype=np.float64)

            closest_distance = float('inf')
            closest_string = None

            for line in cleaned_products:
                distance = osa(title.lower(),
                               line.lower(),
                               insert_costs=insert_costs,
                               transpose_costs=transpose_costs,
                               delete_costs=delete_costs)
                if distance < closest_distance:
                    closest_distance = distance
                    closest_string = line.lower()

            edit_distance_preds.append(closest_string)

        global manual_search
        global text
        global receipt_titles_global
        global lstm_output
        global edit_distance_output
        global gbdt_output

        manual_search = False
        text = receipt
        receipt_titles_global = receipt_titles
        lstm_output = lstm_preds
        edit_distance_output = edit_distance_preds
        gbdt_output = gbdt_preds
        print("edit_distance_output " + str(edit_distance_output))

        return render_template("index.html",
                               manual_search=False,
                               text=receipt,
                               receipt_titles=receipt_titles,
                               lstm_output=lstm_preds,
                               edit_distance_output=edit_distance_preds,
                               gbdt_output=gbdt_preds)
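The cost scheme repeated throughout this example is deliberately asymmetric: insertions into the query are cheap (0.3), transpositions moderate (0.7), deletions expensive (1.2), and substitutions stay at the default 1.0. This presumably biases matching toward catalogue entries that are longer than the often-truncated receipt text. A small sketch of the effect, assuming weighted_levenshtein:

import numpy as np
from weighted_levenshtein import osa

insert_costs = np.full(128, 0.3, dtype=np.float64)
transpose_costs = np.full((128, 128), 0.7, dtype=np.float64)
delete_costs = np.full(128, 1.2, dtype=np.float64)

# Extending 'milk' to 'whole milk' takes six insertions: 6 * 0.3 = 1.8,
# far cheaper than the 6.0 that default unit costs would charge.
print(osa('milk', 'whole milk',
          insert_costs=insert_costs,
          transpose_costs=transpose_costs,
          delete_costs=delete_costs))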
Example #6
 def _osa(self, x, y):
     return osa(x, y, self.iw, self.dw, self.sw, self.tw)
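This one-liner passes the cost arrays positionally; in weighted_levenshtein's osa the positional order after the two strings is insert, delete, substitute, transpose, which is presumably what iw, dw, sw, and tw abbreviate. A minimal sketch of the implied wrapper class (everything beyond the snippet itself is hypothetical):

import numpy as np
from weighted_levenshtein import osa

class WeightedOSA:
    """Hypothetical wrapper around the _osa one-liner above."""

    def __init__(self):
        self.iw = np.ones(128, dtype=np.float64)         # insert weights
        self.dw = np.ones(128, dtype=np.float64)         # delete weights
        self.sw = np.ones((128, 128), dtype=np.float64)  # substitute weights
        self.tw = np.ones((128, 128), dtype=np.float64)  # transpose weights

    def _osa(self, x, y):
        return osa(x, y, self.iw, self.dw, self.sw, self.tw)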