コード例 #1
0
ファイル: picktab.py プロジェクト: c0ntradicti0n/PickTab
 def miss_out_logo_in_corner(self, table, logo_string, cell_coords=(-1, -1)):
     """Remove a logo that was read into the end of one table cell.

     The bottom-right corner often contains the site logo (for example
     'differencebetween.net') merged with a wrongly detected table border.
     The end of the cell text is fuzzy-matched against ``logo_string`` and,
     if it looks like the logo, the best-matching tail is cut off.

     Args:
         table: Indexable rows of string cells; the chosen cell is
             overwritten in place.
         logo_string: The logo text to look for at the end of the cell.
         cell_coords: ``(row, column)`` index of the cell to clean. The
             default ``(-1, -1)`` addresses the last cell of the last row.

     Returns:
         The same ``table`` object that was passed in.
     """
     row, col = cell_coords[0], cell_coords[1]
     right_bottom = table[row][col]

     # Cheap pre-check on the last 30 characters before scanning tail lengths.
     if ratio(right_bottom[-30:], logo_string) > 0.5:
         # ratios[r] scores the last r characters against the logo.
         # NOTE(review): for r == 0 the slice [-0:] is the *whole* cell, and
         # the "+ 1" below removes one character more than the best-scoring
         # tail. Both are kept from the original; confirm they are intended
         # (the extra character may be the mis-detected border).
         ratios = [ratio(right_bottom[-r:], logo_string) for r in range(35)]
         cut = max(range(len(ratios)), key=ratios.__getitem__) + 1
         # Slice the tail off instead of str.replace(): replace() would also
         # delete every other occurrence of the same text inside the cell.
         table[row][col] = right_bottom[:-cut].strip()
     return table
コード例 #2
0
 def clean_duplicates(df):
     """Return the index labels of rows that are not the best match for their title.

     Rows are grouped by ``title_x``. Within each group the row whose
     ``title_x``/``title_y`` similarity ratio is highest is the one to keep
     (the first such row wins on ties). The labels of all other rows are
     returned.

     Args:
         df: DataFrame with ``title_x`` and ``title_y`` columns.

     Returns:
         A list of index labels, in no particular order.
     """
     best_by_title = {}
     for label, record in df.iterrows():
         score = _levenshtein.ratio(record.title_x, record.title_y)
         current = best_by_title.get(record.title_x)
         # Strictly greater, so the earliest row keeps its place on a tie.
         if current is None or score > current[1]:
             best_by_title[record.title_x] = (label, score)

     winners = {label for label, _ in best_by_title.values()}
     all_labels = set(df.index.values.tolist())
     return list(all_labels - winners)
コード例 #3
0
ファイル: picktab.py プロジェクト: c0ntradicti0n/PickTab
    def run_picktab(self, path, gold):
        """Process ``path`` and assert every extracted cell is close to ``gold``.

        The extracted table is pretty-printed, then each cell is compared
        case-insensitively with the cell at the same position in ``gold``.
        The similarity ratio must exceed 0.96. On any failure the offending
        pair is printed before the exception propagates.
        """
        extracted = self.t.process(path, logo_string='[Dap| DifferenceBetween.net')
        import pprint
        pprint.pprint(extracted)

        for row_no, row in enumerate(extracted):
            for col_no, cell in enumerate(row):
                try:
                    similarity = ratio(cell.lower(), gold[row_no][col_no].lower())
                    self.assertGreater(similarity, 0.96)
                except BaseException:
                    # Show which pair failed, then let the failure surface.
                    print(cell, " <!=> ", gold[row_no][col_no])
                    raise
コード例 #4
0
 def levenshtein_comparison(results_df: pd.DataFrame):
     """Keep only the rows whose title fully matches a not-found title.

     Each row's lower-cased ``title`` is compared with every entry of
     ``self.not_found['title']``. A row is kept when its best similarity
     ratio is exactly 1.0. The row count is printed before and after
     filtering.

     NOTE(review): ``self`` is not a parameter of this function, so it has
     to come from an enclosing scope that is not visible here.
     NOTE(review): only the row title is lower-cased; this presumably relies
     on the not-found titles already being lower case. Verify against the
     caller.

     Args:
         results_df: DataFrame with a string ``title`` column. It is not
             modified.

     Returns:
         A new DataFrame holding only the matching rows, in their original
         order.
     """
     print("Size of result_df: {}".format(results_df.shape[0]))
     cashtags_not_found = self.not_found['title'].tolist()

     kept_positions = []
     for position, (_, row) in enumerate(results_df.iterrows()):
         curr_title = row['title'].lower()
         # default=0.0 keeps an empty not-found list from raising ValueError;
         # in that case no row can match.
         best_ratio = max(
             (_levenshtein.ratio(curr_title, tag) for tag in cashtags_not_found),
             default=0.0,
         )
         if best_ratio == 1.0:
             kept_positions.append(position)

     # Select by position: no temporary column is written into the caller's
     # frame, and duplicate index labels cannot mark the wrong rows.
     results_df = results_df.iloc[kept_positions]
     print("Size of result_df: {}".format(results_df.shape[0]))
     return results_df
コード例 #5
0
 def levenshtein_comparison(df):
     """Select, per ``key``, the row whose title best matches its ticker name.

     For every row the ticker with the same ``symbol`` is looked up in
     ``./utils/secwiki_tickers.csv``. If that ticker has a name and the
     similarity ratio between the row ``title`` and the name is above 0.55,
     the row is a candidate. Each candidate is printed, and only the
     highest-scoring candidate per ``key`` is kept.

     Args:
         df: DataFrame with ``key``, ``symbol``, ``title`` and
             ``dbpDescription`` columns.

     Returns:
         A DataFrame with one row per matched ``key`` and a fresh 0..n-1
         index. It is empty when nothing matched.
     """
     tickers = pd.read_csv('./utils/secwiki_tickers.csv')
     rows = {}
     for _, row in df.iterrows():
         # This one key is explicitly excluded from matching.
         if row.key == "NASDAQ|FBMS":
             continue
         match_col = tickers.loc[tickers['Ticker'] == row.symbol]
         if not match_col.empty and not pd.isna(match_col.Name.values[0]):
             lev = _levenshtein.ratio(row.title, match_col.Name.values[0])
             if lev > 0.55:
                 # Replace the stored row only when this one scores higher;
                 # an unconditional overwrite would let a later, worse match
                 # win.
                 if row.key not in rows or lev > rows[row.key][1]:
                     rows[row.key] = [row, lev]
                 print(row.dbpDescription[:50], row.key, row.title, colored("MATCH", "blue"), match_col.Name.values[0], match_col.Ticker.values[0], lev)

     # Build the frame in one step; DataFrame.append in a loop is quadratic
     # and no longer exists in pandas 2.x.
     results = pd.DataFrame([best_row for best_row, _ in rows.values()])
     return results.reset_index(drop=True)
コード例 #6
0
ファイル: cnn_test.py プロジェクト: HYUNMIN-KIM/flask_start
def jaro(sentence, query):
    """Return the similarity ratio between ``sentence`` and ``query``.

    Despite the function name, the value comes from ``_levenshtein.ratio``.
    """
    return _levenshtein.ratio(sentence, query)
コード例 #7
0
def compare(term, name):
    """Return the similarity ratio of ``term`` and ``name`` after simplifying both."""
    simplified_term = simplify(term)
    simplified_name = simplify(name)
    return ratio(simplified_term, simplified_name)