Example #1
    def test(self):
        t = Text(as_unicode('Alexander Tkachenko elab Pärnus'))
        self.assertEqual(t.named_entities, ['Alexander Tkachenko', as_unicode('Pärnu')])
        self.assertEqual(t.named_entity_labels, ['PER', 'LOC'])
        self.assertEqual(t.named_entity_spans, [(0, 19), (25, 31)])
        
        t = Text(as_unicode('Tallinn on Eesti pealinn.'))
        self.assertEqual(t.named_entities, ['Tallinn', 'Eesti'])
        self.assertEqual(t.named_entity_labels, ['LOC', 'LOC'])
        
        t = Text(as_unicode('Eesti piirneb põhjas üle Soome lahe Soome Vabariigiga.'))
        self.assertEqual(t.named_entities, ['Eesti', 'Soome laht', 'Soome Vabariik'])
        self.assertEqual(t.named_entity_labels, ['LOC', 'LOC', 'LOC'])

        t = Text(as_unicode('2006. aastal valiti presidendiks Toomas Hendrik Ilves.'))
        self.assertEqual(t.named_entities, ['Toomas Hendrik Ilves'])
        self.assertEqual(t.named_entity_labels, ['PER'])
        
        t = Text(as_unicode('Inimestelt saadud vihjed pole veel politseil aidanud leida 43-aastast Kajar Paasi, kes tema naise sõnul Ardus maanteel rööviti.'))
        self.assertEqual(t.named_entities, ['Kajar Paasi', 'Ardu'])
        self.assertEqual(t.named_entity_labels, ['PER', 'LOC'])
        
        t = Text(as_unicode('Tuhanded Šotimaa kodud on lääneranniku piirkondi tabanud „ilmapommi“-tormi tõttu elektrita'))
        self.assertEqual(t.named_entities, [as_unicode('Šotimaa')])
        self.assertEqual(t.named_entity_labels, ['LOC'])
        
        t = Text(as_unicode('Elion AS ja EMT on Eesti suurimad ettevõted.'))
        self.assertEqual(t.named_entities, ['Elion AS', 'EMT', 'Eesti'])
        self.assertEqual(t.named_entity_labels, ['ORG', 'ORG', 'LOC'])
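
The three properties exercised in this test are parallel lists, one entry per recognized entity, so they can be zipped together. A minimal sketch built only on the API shown above (the helper name is hypothetical, not part of estnltk):

# Hypothetical helper: zips the parallel entity/label/span lists into
# (entity, label, (start, end)) tuples; spans are end-exclusive.
def extract_entities(text):
    t = Text(as_unicode(text))
    return list(zip(t.named_entities, t.named_entity_labels,
                    t.named_entity_spans))

# For the second text above this should yield:
# [('Tallinn', 'LOC', (0, 7)), ('Eesti', 'LOC', (11, 16))]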
        
        
Example #2
    def test(self):
        fex = GazetteerFeatureExtractor(estnltk.estner.settings)

        text = Text(as_unicode('Mr Alexander Graham Bell on tuntud teadlane.'))
        doc = json_document_to_estner_document(text)
        
        self.assertEqual(len(doc.tokens), 8)
        
        MorphFeatureExtractor().process(doc)
        LocalFeatureExtractor().process(doc)
        fex.process(doc)
        
        t = doc.tokens[0]
        self.assertEqual(t.word, 'Mr')
        self.assertTrue('gaz' not in t)
        
        t = doc.tokens[1]
        self.assertEqual(t.word, 'Alexander')
        self.assertTrue('gaz' in t)
        self.assertTrue('peop' in t['gaz'])
        
        t = doc.tokens[2]
        self.assertEqual(t.word, 'Graham')
        self.assertTrue('gaz' in t)
        self.assertTrue('peop' in t['gaz'])
        
        t = doc.tokens[3]
        self.assertEqual(t.word, 'Bell')
        self.assertTrue('gaz' in t)
        self.assertTrue('peop' in t['gaz'])
        
        t = doc.tokens[4]
        self.assertEqual(t.word, 'on')
        self.assertTrue('gaz' not in t)
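
As the assertions show, GazetteerFeatureExtractor attaches a 'gaz' entry only to tokens whose word form appears in one of its word lists, with 'peop' marking person-name lists. A toy illustration of the idea; the GAZETTEER dict is a stand-in, since the real extractor loads its lists from estnltk.estner.settings:

# Illustrative stand-in gazetteer; only the 'peop' category tag matches
# the assertions above, everything else here is a simplification.
GAZETTEER = {
    'alexander': {'peop'},
    'graham': {'peop'},
    'bell': {'peop'},
}

def add_gazetteer_features(token):
    categories = GAZETTEER.get(token.word.lower())
    if categories:
        token['gaz'] = categories  # tokens not in any list get no 'gaz' key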
Example #3
    def test(self):
        t = Token()
        t.word = as_unicode('Lõuna-Eestis')
        t.lemma = as_unicode('Lõuna-Eesti+s')
        t['lem'] = as_unicode('lõuna-eesti')
        t.morph = '_H_ sg in'

        fex = LocalFeatureExtractor()
        fex._process(t)

        self.assertEqual(t['w'], as_unicode('Lõuna-Eestis'))
        self.assertEqual(t['wl'], as_unicode('lõuna-eestis'))
        self.assertEqual(t['shape'], 'ULLLL-ULLLLL')
        self.assertEqual(t['shaped'], 'UL-UL')

        self.assertEqual(t['p1'], 'l')
        self.assertEqual(t['p2'], as_unicode('lõ'))
        self.assertEqual(t['p3'], as_unicode('lõu'))
        self.assertEqual(t['p4'], as_unicode('lõun'))

        self.assertEqual(t['s1'], 'i')
        self.assertEqual(t['s2'], 'ti')
        self.assertEqual(t['s3'], 'sti')
        self.assertEqual(t['s4'], 'esti')

        self.assertTrue('2d' not in t)
        self.assertTrue('up' not in t)

        self.assertTrue('iu' in t)
        self.assertTrue('au' not in t)
        self.assertTrue('al' not in t)
        self.assertTrue('ad' not in t)

        self.assertTrue('cu' in t)
        self.assertTrue('cl' in t)
        self.assertTrue('cd' not in t)
        self.assertTrue('cp' not in t)
        self.assertTrue('cds' in t)
        self.assertTrue('cdt' not in t)
        self.assertTrue('cs' in t)

        self.assertEqual(t['bdash'], as_unicode('lõuna'))
        self.assertEqual(t['adash'], as_unicode('eesti'))

        self.assertEqual(t['len'], '11')
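
Judging purely from these assertions, 'shape' maps each character to a case code while keeping punctuation, and 'shaped' is the same string with runs of repeated codes collapsed. A sketch reproducing both values; the digit code 'D' is an assumption, since this test contains no digits:

import re

def shape(word):
    # 'U' for uppercase, 'L' for lowercase, 'D' for digits (assumed);
    # any other character (such as '-') is kept as-is.
    return ''.join('U' if c.isupper() else
                   'L' if c.islower() else
                   'D' if c.isdigit() else c
                   for c in word)

def degenerate_shape(word):
    # Collapse runs of the same code: 'ULLLL-ULLLLL' -> 'UL-UL'
    return re.sub(r'(.)\1+', r'\1', shape(word))

# shape(u'Lõuna-Eestis')            == 'ULLLL-ULLLLL'
# degenerate_shape(u'Lõuna-Eestis') == 'UL-UL'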
Example #4
    def __init__(self, raw_synset):
        """
        Parameters
        ----------
        raw_synset : eurown.Synset
            Underlying Synset.
        """
        self.name = _get_key_from_raw_synset(raw_synset)
        self._raw_synset = raw_synset
        self.id = raw_synset.number or -1
        self.pos = as_unicode(raw_synset.pos)
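
One detail worth noting: `raw_synset.number or -1` replaces any falsy number, not just None, with the sentinel -1. The same fallback idiom in isolation:

# Demonstrates the fallback used above: falsy values collapse to -1.
def synset_id(number):
    return number or -1

assert synset_id(4563) == 4563
assert synset_id(None) == -1
assert synset_id(0) == -1  # note: a legitimate id of 0 is also replaced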
Example #5
    def process_line(self, line):
        """Process a line of data.
        
        Sends the data through the pipe to the process and flush it. Reads a resulting line
        and returns it.
        
        Parameters
        ----------
        
        line: str
            The data sent to process. Make sure it does not contain any newline characters.

        Returns
        -------
        str: The line returned by the Java process
        
        Raises
        ------
        Exception
            In case of EOF is encountered.
        IoError
            In case it was impossible to read or write from the subprocess standard input / output.
        """
        assert isinstance(line, str)
        try:
            self._process.stdin.write(as_binary(line))
            self._process.stdin.write(as_binary('\n'))
            self._process.stdin.flush()
            result = as_unicode(self._process.stdout.readline())
            if result == '':
                stderr = as_unicode(self._process.stderr.read())
                raise Exception(
                    'EOF encountered while reading stream. Stderr is {0}.'.
                    format(stderr))
            return result
        except Exception:
            self._process.terminate()
            raise
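
process_line assumes self._process is a subprocess with all three standard streams piped; a minimal sketch of the setup it relies on (the command line is a placeholder, not the actual invocation):

import subprocess

# Placeholder command; what matters for process_line is that stdin,
# stdout and stderr are all attached as pipes.
process = subprocess.Popen(
    ['java', '-jar', 'placeholder.jar'],
    stdin=subprocess.PIPE,    # request lines are written here
    stdout=subprocess.PIPE,   # one reply line is read back per request
    stderr=subprocess.PIPE)   # read only on EOF, for the error message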
Example #6
    def process_lines(self, input_lines, **kwargs):
        ''' Executes the pipeline of subsequent VISL_CG3 commands. The first
            process in the pipeline receives input_lines as its input, and each
            subsequent process receives the output of the previous process as
            its input.

            The idea of how to construct the pipeline borrows from:
              https://github.com/estnltk/estnltk/blob/1.4.0/estnltk/syntax/tagger.py

            Returns the result of the last process in the pipeline, either as a
            string or, alternatively, as a list of strings (if split_result ==
            True).

            Parameters
            ----------
            input_lines : list of str
                 The input text for the pipeline; should be in the same format
                 as the output of SyntaxPreprocessing.

            split_result : bool
                 Optional argument specifying whether the result should be split
                 by newlines and returned as a list of strings/lines instead.
                 Default: False

            remove_info : bool
                 Optional argument specifying whether the additional information
                 added during preprocessing and syntactic processing should be
                 removed from the results.
                 Default: True
                 The method cleanup_lines() will be used for removing the
                 additional info, and all the parameters passed to this method
                 will also be forwarded to the cleanup method.
        '''
        split_result_lines = False
        remove_info = True
        for argName, argVal in kwargs.items():
            if argName in ['split_result_lines', 'split_result'] and argVal in [True, False]:
                split_result_lines = argVal
            if argName in ['remove_info', 'info_remover', 'clean_up'] and argVal in [True, False]:
                remove_info = argVal

        # 1) Construct the input file for the first process in the pipeline
        temp_input_file = \
            tempfile.NamedTemporaryFile(prefix='vislcg3_in.', mode='w', delete=False)
        temp_input_file.close()
        # We have to open the file separately for writing, because Py 2.7 does
        # not support passing the parameter encoding='utf-8' to NamedTemporaryFile;
        out_f = codecs.open(temp_input_file.name, mode='w', encoding='utf-8')
        for line in input_lines:
            out_f.write(line.rstrip())
            out_f.write('\n')
        out_f.close()
        # TODO: tempfile is currently used to ensure that the input is in 'utf-8',
        #       but perhaps we can somehow ensure it without using tempfile ??


        # 2) Dynamically construct the pipeline and open processes
        pipeline = []
        for i in range(len(self.rules_pipeline)):
            rule_file = self.rules_pipeline[i]
            process_cmd = [self.vislcg_cmd, '-o', '-g', os.path.join(self.rules_dir, rule_file)]
            process = None
            if i == 0:
                # The first process takes its input from the file
                process_cmd.extend(['-I', temp_input_file.name])
                process = Popen(process_cmd, stdin=PIPE, stdout=PIPE)
            else:
                # A subsequent process takes the output of the previous process as its input
                process = Popen(process_cmd, stdin=pipeline[-1]['process'].stdout, stdout=PIPE)
            # Record the process
            process_dict = {'process': process, 'cmd': process_cmd}
            pipeline.append(process_dict)
        
        # 3) Close all stdout streams, except the last one
        for i in range(len(pipeline)):
            if i != len(pipeline) - 1:
                pipeline[i]['process'].stdout.close()

        # 4) Communicate results from the last item in the pipeline
        result = as_unicode(pipeline[-1]['process'].communicate()[0])
        pipeline[-1]['process'].stdout.close()  # Close the last process

        # Clean-up
        # 1) remove temp file
        os.remove(temp_input_file.name)

        # 2) remove additional info, if required
        if remove_info:
            result = '\n'.join(cleanup_lines(result.split('\n'), **kwargs))

        return result if not split_result_lines else result.split('\n')
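
The stdout-to-stdin chaining in step 2 (and the stream closing in step 3) is the standard subprocess pipeline pattern; reduced to a generic two-stage example with stand-in commands instead of vislcg3 invocations:

from subprocess import Popen, PIPE

# Two-stage pipe: p2 reads p1's stdout directly, so the intermediate
# data never passes through Python.
p1 = Popen(['sort'], stdin=PIPE, stdout=PIPE)
p2 = Popen(['uniq'], stdin=p1.stdout, stdout=PIPE)
p1.stdout.close()             # as in step 3: let p2 own the read end
p1.stdin.write(b'b\na\nb\n')
p1.stdin.close()
output = p2.communicate()[0]  # b'a\nb\n'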