Example #1
0
 def do_col_merge(self, *args):
     """Build a merged column from the columns chosen in the merge popup.

     Reads the selection and the requested column name from
     ``self.merge_popup.content``, asks the backend to merge the selected
     columns, inserts the result at position 0 of the active table under a
     name that does not collide with an existing column, then refreshes
     the dependent dropdown and panels and closes the popup.

     Validation problems (fewer than two columns, empty name) are reported
     through an ``ErrorMsg`` popup and leave the merge popup open. Any
     exception raised while merging closes the merge popup and is reported
     through an ``ErrorMsg`` popup as well.
     """
     running_app = App.get_running_app()
     backend = running_app.backend
     merge_form = self.merge_popup.content

     # Guard: a merge needs two or more source columns.
     if len(merge_form.right_buttons) < 2:
         ErrorMsg(
             error_text='Choose at least 2 columns to merge together.'
         ).open()
         return

     # Guard: the new column must be given a name.
     requested_name = merge_form.ids.new_col_text.text
     if requested_name == '':
         ErrorMsg(
             error_text='Enter a name for the merged column that will be created.'
         ).open()
         return

     try:
         # ``which`` selects the table being edited: 1 -> df1, else df2.
         first_sheet = merge_form.which == 1
         helper = backend.grouper_helper
         frame = helper.df1 if first_sheet else helper.df2

         source_names = [button.text for button in merge_form.right_buttons]
         merged_values = backend.merge_cols(frame, source_names)

         # De-duplicate the name by appending ".1", ".2", ... until it is
         # not already one of the table's columns.
         final_name = requested_name
         suffix = 1
         while any(frame.columns == final_name):
             final_name = '{}.{}'.format(requested_name, suffix)
             suffix += 1
             if suffix > 5000:
                 raise Exception(
                     'Maybe try a different name for the merged column?')

         frame.insert(0, final_name, merged_values)

         # Publish the new column list and refresh the matching dropdown.
         if first_sheet:
             backend.columns1 = frame.columns
             self.populate_dropdown1()
         else:
             backend.columns2 = frame.columns
             self.populate_dropdown2()

         panels = running_app.panels
         panels['alsocompare_screen'].reset_panel()
         panels['append_screen'].populate()
         self.merge_popup.dismiss()
     except Exception as error:
         self.merge_popup.dismiss()
         # "<class 'ValueError'>" -> "ValueError"
         error_type = str(type(error)).split("'")[1]
         message = 'Error creating new column: {}. {}'.format(
             error_type, error)
         ErrorMsg(error_text=message).open()
Example #2
0
class LoadPanel(BoxLayout):
    '''
    Top-level load screen: hosts two FileSection widgets, the main load
    button and an options button.
    '''
    file_sections_box = ObjectProperty(None)
    load_btn = ObjectProperty(None)
    opts_btn = ObjectProperty(None)

    def __init__(self, **kwargs):
        '''Create both file sections, add them to the box and wire the buttons.'''
        super(LoadPanel, self).__init__(**kwargs)

        self.file_section1 = FileSection(text='Populate to:',
                                         default_dataset_name='File1')
        self.file_section2 = FileSection(text='Populate from:',
                                         default_dataset_name='File2')
        for section in (self.file_section1, self.file_section2):
            self.file_sections_box.add_widget(section)

        self.load_btn.bind(on_release=self.load_callback)
        self.opts_btn.bind(on_release=self.opts_callback)

        # start with keyboard focus in the first file section
        self.file_section1.set_focus()

    def opts_callback(self, *args):
        '''Navigate to the search-configuration screen.'''
        running_app = App.get_running_app()
        running_app.nav_to('searchconfig_screen', 'right')

    def show_help(self):
        '''Open the help popup describing the two input sheets.'''
        help_text = (
            '- In SQL terminology, this software implements a "left join" on one or more inexact keys.\n\n'
            '- Use the [b]"Browse"[/b] buttons to select the two spreadsheets you want to match. When prompted, give each sheet a name.\n\n'
            '- The [b]"Populate to"[/b] sheet is the one you want to populate with matches. In SQL terminology it is the "left" table. The match output will contain one (or optionally more) match result for each row in this sheet. [b]This is typically the sheet with the fewest rows.[/b]\n\n'
            '- The [b]"Populate from"[/b] sheet should contain the potential matches for the rows in the first sheet. In SQL terminology this is the "right" table. [u]Note:[/u] not every row in this sheet will necessarily populate into the first sheet, and rows from this sheet can populate more than once into the first one.\n'
        )
        self._popup = HelpMsg(help_text, title='Help', size_hint=(0.92, 0.92))
        self._popup.open()

    def show_about(self):
        '''Open the "About" popup with credits.'''
        about_text = (
            'This GUI and related code is written by Steve Suway. '
            'The GUI is built using Kivy. '
            'Matching computations are performed using string_grouper, which is written by Chris van den Berg. '
            "string_grouper's matching functionality builds upon sparse_dot_topn, which is an open-source project by ING Bank. "
            'ASCII transliteration is done using Unidecode, written by Tomaz Solc. '
            'Other packages this code relies on include NumPy, SciPy, and pandas.'
        )
        self._popup = HelpMsg(about_text, title='About', size_hint=(0.75, 0.60))
        self._popup.open()

    def load_callback(self, btn=None):
        '''
        Show a "please wait" popup and schedule the actual load.
        Does nothing unless both file sections report a non-empty path.
        '''
        first_path = self.file_section1.get_path()
        second_path = self.file_section2.get_path()
        if first_path == '' or second_path == '':
            return

        wait_popup = Popup(size=('425dp', '150dp'),
                           size_hint=(None, None),
                           auto_dismiss=False,
                           title='Please wait')
        self._popup = wait_popup
        wait_popup.content = Label(
            text='Loading files... this can take a long time for large files, and \nthis window will stop responding until finished.')
        wait_popup.open()

        # give the popup a moment to appear before the blocking load starts
        Clock.schedule_once(self.do_load, 0.1)

    def do_load(self, obj):
        '''
        Load both files through the backend, then either move on to the
        next screen or report which file failed. Scheduled by load_callback.
        '''
        app = App.get_running_app()
        backend = app.backend
        labels = backend.labels
        first, second = self.file_section1, self.file_section2

        # dataset labels must differ, so tag the second one on a clash
        labels[0] = first.dataset_name
        labels[1] = second.dataset_name
        if labels[0] == labels[1]:
            labels[1] += '(2)'

        load_successful = backend.init_fast_match(
            first.get_path(), second.get_path(),
            first.sep, second.sep,
            first.encoding, second.encoding)

        self._popup.dismiss()

        if load_successful:
            panels = app.panels
            panels['narrowby_screen'].populate_dropdowns()
            panels['alsocompare_screen'].reset_panel()
            panels['append_screen'].populate()
            app.nav_to('narrowby_screen', 'left')
            return

        helper = backend.grouper_helper
        error_type = helper.error_type
        error_msg = helper.error_msg
        # if the first file loaded fine, the failure is blamed on the second
        problem_file = labels[1] if helper.file1_load_successful else labels[0]

        errtxt = 'Error loading {}.\n\n{}: {}'.format(problem_file, error_type, error_msg)
        if error_type == 'UnicodeDecodeError':
            errtxt += '\n\nThis probably means you selected the wrong encoding. Try re-loading your plaintext file and select a different encoding when prompted.'
        self._popup = ErrorMsg(error_text=errtxt)
        self._popup.open()
Example #3
0
class ExportMatchesPanel(BoxLayout):
    """Export screen: pick how many matches to keep and where to send them.

    The widget properties below default to ``None`` here; the methods of
    this class read them as checkboxes (``.active`` / ``.disabled``) and a
    text field (``.text``). Presumably they are bound by the kv rule for
    this panel -- confirm against the kv file.
    """
    check_clip = ObjectProperty(None)
    check_file = ObjectProperty(None)
    check_space = ObjectProperty(None)
    check_sort = ObjectProperty(None)
    txt_nmatch = ObjectProperty(None)

    def __init__(self, **kwargs):
        super(ExportMatchesPanel, self).__init__(**kwargs)

        self.check_clip.bind(active=self.export_opts_callback)
        self.check_file.bind(active=self.export_opts_callback)

        # Delimiter and encoding used for plaintext (.csv/.txt) export;
        # filled in by the chooser popups.
        self.sep = None
        self.encoding = None

    def _report_failure(self, headline, error):
        """Dismiss the current popup and show *error* in an ErrorMsg popup.

        The popup text is ``headline``, a blank line, then
        ``<exception type>: <exception message>``.
        """
        self._popup.dismiss()
        # "<class 'ValueError'>" -> "ValueError" (module-qualified for
        # exception classes that are not builtins)
        error_type = str(type(error)).split('\'')[1]
        self._popup = ErrorMsg(
            error_text='{}\n\n{}: {}'.format(headline, error_type, error))
        self._popup.open()

    def nmatch_up(self):
        """Increase the match count by one, capped at backend.max_n_matches."""
        limit = App.get_running_app().backend.max_n_matches
        current = int(self.txt_nmatch.text)
        if current >= limit:
            return
        self.txt_nmatch.text = str(current + 1)
        # After an increment the spacer option is made available.
        self.check_space.disabled = False

    def nmatch_down(self):
        """Decrease the match count by one, never going below 1."""
        current = int(self.txt_nmatch.text)
        # Compare numerically so values such as '0' or '01' cannot be
        # decremented below 1.
        if current <= 1:
            return
        current -= 1
        self.txt_nmatch.text = str(current)
        if current == 1:
            self.check_space.disabled = True

    def show_help(self):
        """Open the help popup describing the export options."""
        help_text = (
            "- [u]Keep the top [i]n[/i] matches[/u]: choose how many alternate matches you want to review. If you only want the top-scoring match for each row, select '1'.\n\n"
            "- [u]Add spacer between groups of matches[/u]: if you export multiple alternate matches, this option adds a blank spacer row between separate groups of matches, which helps guide the eye during manual review.\n\n"
            "- [u]Sort rows by similarity[/u]: if enabled, rows will be sorted by match score. If disabled, the original row order from your input spreadsheet will be preserved.\n\n"
            "- [u]Copy matches to clipboard[/u]: choose this option if you want to paste the matches into Excel or Google Sheets.\n\n"
            "- [u]Save matches to file[/u]: choose this option if you want to save the matches to a spreadsheet file (.xlsx, .csv, or .txt).\n"
        )
        self._popup = HelpMsg(help_text, title='Help', size_hint=(0.9, 0.9))
        self._popup.open()

    def export_opts_callback(self, check, value):
        """Keep at least one export destination ticked.

        Bound to ``active`` of both destination checkboxes; if the change
        left neither active, the checkbox that fired is re-activated.
        """
        clip_on = self.check_clip.active
        file_on = self.check_file.active
        if not clip_on and not file_on:
            check.active = True

    def match_opts_callback(self, check, value):
        """Keep one of check_all/check_best ticked and toggle the spacer option.

        NOTE(review): ``check_all`` and ``check_best`` are not declared as
        properties on this class and nothing in this class binds this
        callback -- confirm they are supplied elsewhere (e.g. the kv rule)
        or that this method is unused.
        """
        all_on = self.check_all.active
        best_on = self.check_best.active
        if not all_on and not best_on:
            check.active = True
        # Re-read: the line above may have just re-activated check_all.
        if self.check_all.active == True:
            self.check_space.disabled = False
        else:
            self.check_space.disabled = True

    def back_callback(self, ):
        """Drop the backend's appended data and return to the append screen."""
        app = App.get_running_app()
        app.backend.drop_appends()
        app.nav_to('append_screen', 'right')

    def prep_export(self, *args):
        """Build ``self.matches_for_export`` from the current option widgets."""
        backend = App.get_running_app().backend
        self.matches_for_export = backend.clean_matches_for_export(
            int(self.txt_nmatch.text),
            restore_row_order=(not self.check_sort.active),
            use_spacer=self.check_space.active)

    def export_callback(self):
        """Show a "please wait" popup and schedule the actual export."""
        self._popup = Popup(size=('410dp', '150dp'),
                            size_hint=(None, None),
                            auto_dismiss=False,
                            title='Please wait')
        self._popup.content = Label(
            text=
            'Working... this can take a long time for large files, and \nthis window will stop responding until finished.'
        )
        self._popup.open()
        # wait for popup to actually open before starting to load
        Clock.schedule_once(self.export_really, 0.1)

    def export_really(self, obj):
        """Run the export chosen on the panel (clipboard or file).

        Scheduled by ``export_callback``. The wait popup was opened with
        ``auto_dismiss=False``, so every path out of this method must
        dismiss it explicitly or the UI stays blocked.
        """
        if self.check_clip.active:
            try:
                self.prep_export()
                self.matches_for_export.to_clipboard(index=False)
            except Exception as error:
                # e.g. no clipboard mechanism available on this system
                self._report_failure('Error copying matches to clipboard.',
                                     error)
            else:
                self._popup.dismiss()
            return

        try:
            with TkSaveDialog('matches.xlsx',
                              [('Spreadsheet', '.xlsx .csv .txt')]) as dialog:
                filename = dialog.get_filename()
        finally:
            # Never leave the non-dismissable wait popup open, even if the
            # dialog itself fails.
            self._popup.dismiss()

        # Truthiness check: treats '' (and None / empty tuple, should the
        # dialog ever return those on cancel) as "no file chosen".
        if filename:
            self.save_export(filename)

    def save_export(self, filename):
        """Record the target path (suffix lower-cased) and continue the export."""
        target = Path(filename)
        self.out_file = target.with_suffix(target.suffix.lower())
        self.plaintext_opts()

    def plaintext_opts(self, obj=None):
        """Ask for a delimiter when saving .csv/.txt; otherwise save directly."""
        if self.out_file.suffix in ('.csv', '.txt'):
            self._popup.dismiss()
            self._popup = Popup(size=('300dp', '350dp'),
                                size_hint=(None, None),
                                title='Select plaintext delimiter',
                                auto_dismiss=False)
            self._popup.content = PlaintextSepChooser(self._popup.dismiss,
                                                      self.delim_opts_next)
            self._popup.open()
        else:
            self.save_really()

    def delim_opts_next(self, obj):
        """Store the chosen delimiter, then ask for the text encoding.

        Does nothing while the chooser reports an empty choice.
        """
        delim_choice = self._popup.content.get_delim_choice()
        if delim_choice == '':
            return

        self.sep = delim_choice
        self._popup.dismiss()
        self._popup = Popup(size=('450dp', '200dp'),
                            size_hint=(None, None),
                            title='Select plaintext encoding',
                            auto_dismiss=False)
        self._popup.content = PlaintextEncodingChooser(
            self._popup.dismiss, self.enc_opts_ok)
        self._popup.open()

    def enc_opts_ok(self, obj):
        """Store the chosen encoding (None for the default) and save."""
        enc_choice = self._popup.content.dropbtn.text
        if enc_choice == '[use default]':
            self.encoding = None
        else:
            # The codec name is the first space-separated token of the label.
            self.encoding = enc_choice.split(' ')[0]
        self.save_really()

    def save_really(self):
        """Write the prepared matches to ``self.out_file``.

        .xlsx is written with ``to_excel``; .csv/.txt with ``to_csv`` using
        the stored delimiter and encoding. Any failure, including an
        unsupported suffix, is reported in an ErrorMsg popup.
        """
        suffix = self.out_file.suffix
        try:
            if suffix == '.xlsx':
                self.prep_export()
                self.matches_for_export.to_excel(str(self.out_file),
                                                 index=False)
            elif suffix in ('.csv', '.txt'):
                self.prep_export()
                self.matches_for_export.to_csv(str(self.out_file),
                                               index=False,
                                               sep=self.sep,
                                               encoding=self.encoding)
            else:
                raise Exception('Filetype must be .xlsx, .csv, or .txt')
        except Exception as error:
            self._report_failure(
                'Error saving {}.'.format(self.out_file.parts[-1]), error)
        else:
            self._popup.dismiss()
class SearchConfigPanel(BoxLayout):
    """Settings screen for the matching parameters stored on the backend.

    The widget properties below default to ``None`` here; the methods of
    this class read them as text inputs (``.text``) and checkboxes
    (``.active``). Presumably they are bound by the kv rule for this
    panel -- confirm against the kv file.
    """
    sim_input = ObjectProperty(None)
    nmatch_input = ObjectProperty(None)
    ngram_input = ObjectProperty(None)
    excl_input = ObjectProperty(None)
    check_case = ObjectProperty(None)
    check_amperland = ObjectProperty(None)
    check_unidecode = ObjectProperty(None)
    check_shortstr = ObjectProperty(None)
    check_whitesp = ObjectProperty(None)

    def __init__(self, **kwargs):
        """Seed every widget from the backend's current settings."""
        super(SearchConfigPanel, self).__init__(**kwargs)

        self.backend = App.get_running_app().backend

        self.sim_input.text = str(self.backend.min_similarity)
        self.sim_input.bind(text=self.sim_input_updated)

        self.nmatch_input.text = str(self.backend.max_n_matches)
        self.nmatch_input.bind(text=self.nmatch_input_updated)

        self.ngram_input.text = str(self.backend.ngram_size)
        self.ngram_input.bind(text=self.ngram_input_updated)

        self.excl_input.text = self.backend.excl_chars

        self.check_amperland.active = self.backend.advanced_opts['amperland']
        self.check_unidecode.active = self.backend.advanced_opts['unidecode']
        self.check_shortstr.active = self.backend.advanced_opts['shortstr']
        self.check_case.active = self.backend.ignore_case
        self.check_whitesp.active = self.backend.ignore_whitesp

    @staticmethod
    def _enforce_minimum_of_one(widget, text):
        """Strip minus signs from *widget* and raise integers below 1 to 1.

        Text that is empty or not (yet) a valid integer is left alone so
        the user can keep typing.
        """
        if '-' in text:
            # Assigning .text changes the property again, so the stripped
            # value goes back through the bound handler.
            widget.text = text.replace('-', '')
            return
        try:
            value = int(text)
        except ValueError:
            return
        if value < 1:
            widget.text = '1'

    def _show_help(self, help_text, title, size_hint):
        """Open a HelpMsg popup and keep it as ``self._popup``."""
        self._popup = HelpMsg(help_text, title=title, size_hint=size_hint)
        self._popup.open()

    def sim_input_updated(self, sim_input, text):
        """Strip minus signs and cap the similarity field at 1.

        Text that is empty or not (yet) a valid float, such as a lone
        ``'.'``, is left alone instead of raising ``ValueError``.
        """
        if '-' in text:
            sim_input.text = text.replace('-', '')
            return
        try:
            value = float(text)
        except ValueError:
            return
        if value > 1:
            sim_input.text = '1'

    def nmatch_input_updated(self, nmatch_input, text):
        """Keep the max-matches field free of '-' and at least 1."""
        self._enforce_minimum_of_one(nmatch_input, text)

    def ngram_input_updated(self, ngram_input, text):
        """Keep the n-gram size field free of '-' and at least 1."""
        self._enforce_minimum_of_one(ngram_input, text)

    def help_sim(self):
        """Open the help popup for the minimum match score."""
        help_text = 'Set the minimum match score allowed for potential matches. The valid range is between 0 and 1. Scores below this number will not populate into the output. It is generally recommend to keep this set to 0 and manually review matches with low scores.\n\n[b]Note[/b]: If an entry in the [b]Populate to[/b] sheet has no match in the [b]Populate from[/b] sheet with a score greater than this number, the entry will [i]not[/i] be dropped from the output. Rather, the entry will not be paired with any entry from the [b]Populate from[/b] sheet, and the match score for the entry will be left blank.\n'
        self._show_help(help_text, 'Minimum match score', (0.75, 0.7))

    def help_nmatch(self):
        """Open the help popup for the maximum number of matches."""
        help_text = 'Set the maximum number of matches to consider for each entry in the [b]Populate to[/b] sheet. Decreasing this can lead to more false negatives (dropped matches that are actually good). Increasing this can lead to more false positives (keeping matches that are not good). [b]For large datasets you may get better results if you increase this number.[/b]\n'
        self._show_help(help_text, 'Maximum number of matches', (0.7, 0.6))

    def help_ngram(self):
        """Open the help popup for the n-gram size."""
        help_text = 'The match algorithm converts character strings into a set of "features" based on small groups of sequential characters called "n-grams". The n-gram size sets the number of sequential characters to use for feature extraction. An n-gram size of 3 generally works well, but if all your words/strings are very short a size of 2 could be more accurate.\n'
        self._show_help(help_text, 'N-gram size', (0.7, 0.6))

    def help_excl(self):
        """Open the help popup for the ignored-characters list."""
        help_text = 'Provide a list of characters that should be ignored when computing match scores. These characters will be removed from each entry before matching.\n\n[b]Note1[/b]: Do not include any delimiters between characters in this list. Just type each character.\n\n[b]Note2[/b]: You cannot include multiple-character sequences. For example, if you are trying to exclude the sequence "com", this will remove all c\'s, o\'s, and m\'s individually.\n\n[b]Note3[/b]: The modified entry will be used to compute match scores, but the original entry will always be preserved in the final output sheet. \n'
        self._show_help(help_text, 'Ignore characters', (0.8, 0.8))

    def help_case(self):
        """Open the help popup for the ignore-case option."""
        help_text = 'Ignore whether strings are written in upper- or lower-case. If enabled, "hello" will be considered identical to "HELLO".\n\n[b]Note[/b]: The case-corrected entry will be used to compute match scores, but the original entry will always be preserved in the final output sheet.\n'
        self._show_help(help_text, 'Ignore case', (0.65, 0.55))

    def help_amperland(self):
        """Open the help popup for the ampersand-conversion option."""
        help_text = 'Replace ampersands (&) with "and". Example: if enabled, "Health & Safety" will be considered identical to "Health and Safety".\n\n[b]Note[/b]: The modified entry will be used to compute match scores, but the original entry will always be preserved in the final output sheet.\n'
        self._show_help(help_text, 'Convert ampersands', (0.65, 0.55))

    def help_unidecode(self):
        """Open the help popup for the ASCII-transliteration option."""
        help_text = "Enable to strip accents/diacritics and to transliterate non-Latin characters. Examples: if enabled, ö is equal to o; å is equal to a, ş is equal to s; δ is equal to d; щ is equal to shch; the Hangul character hieut is equal to h.\n\n[b]Note[/b]: The modified entry will be used to compute match scores, but the original entry will always be preserved in the final output sheet.\n"
        self._show_help(help_text, 'ASCII transliteration', (0.65, 0.55))

    def help_shortstr(self):
        """Open the help popup for the short-string padding option."""
        help_text = 'If disabled, words/strings shorter than the n-gram size (default=3) will not be matched. If enabled, short strings will be padded to the length of the n-gram size. This allows match scores to be computed for short strings. \n\n[b]Note1[/b]: Match scores may be less accurate for padded words/strings, though these scores are often still useful.\n\n[b]Note2[/b]: The padded entry will be used to compute match scores, but the original entry will always be preserved in the final output sheet.\n'
        self._show_help(help_text, 'Short string support', (0.75, 0.65))

    def help_whitesp(self):
        """Open the help popup for the ignore-whitespace option."""
        help_text = 'If enabled, whitespace will be stripped from each entry before matching. This includes spaces, tabs, line-breaks, etc.\n\n[b]Note[/b]: The modified entry will be used to compute match scores, but the original entry will always be preserved in the final output sheet.\n'
        self._show_help(help_text, 'Ignore whitespace', (0.65, 0.5))

    def ok_callback(self):
        """Validate the numeric fields, store all settings and leave the screen.

        Each numeric field is parsed before anything is written to the
        backend. A field that is empty or does not parse opens an ErrorMsg
        popup with that field's prompt and aborts, leaving the backend
        untouched. On success the backend regex is regenerated and the app
        navigates to the load screen.
        """
        numeric_fields = (
            (self.sim_input, float,
             'Please enter a value for the minimum match score.'),
            (self.nmatch_input, int,
             'Please enter a value for the maximum number of matches.'),
            (self.ngram_input, int,
             'Please enter a value for the n-gram size.'),
        )
        parsed = []
        for widget, convert, prompt in numeric_fields:
            try:
                # '' also raises ValueError, so the empty case is covered.
                parsed.append(convert(widget.text))
            except ValueError:
                self._popup = ErrorMsg(error_text=prompt)
                self._popup.open()
                return
        min_similarity, max_n_matches, ngram_size = parsed

        app = App.get_running_app()
        backend = app.backend
        backend.min_similarity = min_similarity
        backend.max_n_matches = max_n_matches
        backend.ngram_size = ngram_size
        backend.excl_chars = self.excl_input.text
        backend.ignore_whitesp = self.check_whitesp.active
        backend.ignore_case = self.check_case.active
        backend.advanced_opts = {
            'amperland': self.check_amperland.active,
            'unidecode': self.check_unidecode.active,
            'shortstr': self.check_shortstr.active
        }
        backend.generate_regex()
        app.nav_to('load_screen', 'left')