def test(self):
    t = Text('Alexander Tkachenko elab Pärnus')
    self.assertEqual(t.named_entities, ['Alexander Tkachenko', as_unicode('Pärnu')])
    self.assertEqual(t.named_entity_labels, ['PER', 'LOC'])
    self.assertEqual(t.named_entity_spans, [(0, 19), (25, 31)])

    t = Text(as_unicode('Tallinn on Eesti pealinn.'))
    self.assertEqual(t.named_entities, ['Tallinn', 'Eesti'])
    self.assertEqual(t.named_entity_labels, ['LOC', 'LOC'])

    t = Text(as_unicode('Eesti piirneb põhjas üle Soome lahe Soome Vabariigiga.'))
    self.assertEqual(t.named_entities, ['Eesti', 'Soome laht', 'Soome Vabariik'])
    self.assertEqual(t.named_entity_labels, ['LOC', 'LOC', 'LOC'])

    t = Text(as_unicode('2006. aastal valiti presidendiks Toomas Hendrik Ilves.'))
    self.assertEqual(t.named_entities, ['Toomas Hendrik Ilves'])
    self.assertEqual(t.named_entity_labels, ['PER'])

    t = Text(as_unicode('Inimestelt saadud vihjed pole veel politseil aidanud leida 43-aastast Kajar Paasi, kes tema naise sõnul Ardus maanteel rööviti.'))
    self.assertEqual(t.named_entities, ['Kajar Paasi', 'Ardu'])
    self.assertEqual(t.named_entity_labels, ['PER', 'LOC'])

    t = Text(as_unicode('Tuhanded Šotimaa kodud on lääneranniku piirkondi tabanud „ilmapommi“-tormi tõttu elektrita'))
    self.assertEqual(t.named_entities, [as_unicode('Šotimaa')])
    self.assertEqual(t.named_entity_labels, ['LOC'])

    t = Text(as_unicode('Elion AS ja EMT on Eesti suurimad ettevõted.'))
    self.assertEqual(t.named_entities, ['Elion AS', 'EMT', 'Eesti'])
    self.assertEqual(t.named_entity_labels, ['ORG', 'ORG', 'LOC'])
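# A minimal usage sketch of the named-entity API exercised by the test above,
# assuming estnltk 1.x with its default NER model installed (demo code only,
# not part of the test suite):

from estnltk import Text

def print_named_entities(sentence):
    text = Text(sentence)
    # named_entities, named_entity_labels and named_entity_spans are
    # parallel lists over the entities recognized in the text
    for entity, label, span in zip(text.named_entities,
                                   text.named_entity_labels,
                                   text.named_entity_spans):
        print(entity, label, span)

# print_named_entities('Tallinn on Eesti pealinn.') should print something
# like: Tallinn LOC (0, 7)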
def test(self):
    fex = GazetteerFeatureExtractor(estnltk.estner.settings)
    text = Text(as_unicode('Mr Alexander Graham Bell on tuntud teadlane.'))
    doc = json_document_to_estner_document(text)
    self.assertEqual(len(doc.tokens), 8)

    MorphFeatureExtractor().process(doc)
    LocalFeatureExtractor().process(doc)
    fex.process(doc)

    t = doc.tokens[0]
    self.assertEqual(t.word, 'Mr')
    self.assertTrue('gaz' not in t)

    t = doc.tokens[1]
    self.assertEqual(t.word, 'Alexander')
    self.assertTrue('gaz' in t)
    self.assertTrue('peop' in t['gaz'])

    t = doc.tokens[2]
    self.assertEqual(t.word, 'Graham')
    self.assertTrue('gaz' in t)
    self.assertTrue('peop' in t['gaz'])

    t = doc.tokens[3]
    self.assertEqual(t.word, 'Bell')
    self.assertTrue('gaz' in t)
    self.assertTrue('peop' in t['gaz'])

    t = doc.tokens[4]
    self.assertEqual(t.word, 'on')
    self.assertTrue('gaz' not in t)
def test(self):
    t = Token()
    t.word = as_unicode('Lõuna-Eestis')
    t.lemma = as_unicode('Lõuna-Eesti+s')
    t['lem'] = as_unicode('lõuna-eesti')
    t.morph = '_H_ sg in'

    fex = LocalFeatureExtractor()
    fex._process(t)

    self.assertEqual(t['w'], as_unicode('Lõuna-Eestis'))
    self.assertEqual(t['wl'], as_unicode('lõuna-eestis'))
    self.assertEqual(t['shape'], 'ULLLL-ULLLLL')
    self.assertEqual(t['shaped'], 'UL-UL')

    self.assertEqual(t['p1'], 'l')
    self.assertEqual(t['p2'], as_unicode('lõ'))
    self.assertEqual(t['p3'], as_unicode('lõu'))
    self.assertEqual(t['p4'], as_unicode('lõun'))

    self.assertEqual(t['s1'], 'i')
    self.assertEqual(t['s2'], 'ti')
    self.assertEqual(t['s3'], 'sti')
    self.assertEqual(t['s4'], 'esti')

    self.assertTrue('2d' not in t)
    self.assertTrue('up' not in t)
    self.assertTrue('iu' in t)
    self.assertTrue('au' not in t)
    self.assertTrue('al' not in t)
    self.assertTrue('ad' not in t)
    self.assertTrue('cu' in t)
    self.assertTrue('cl' in t)
    self.assertTrue('cd' not in t)
    self.assertTrue('cp' not in t)
    self.assertTrue('cds' in t)
    self.assertTrue('cdt' not in t)
    self.assertTrue('cs' in t)

    self.assertEqual(t['bdash'], as_unicode('lõuna'))
    self.assertEqual(t['adash'], as_unicode('eesti'))
    self.assertEqual(t['len'], '11')
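# The prefix/suffix assertions above follow a simple pattern: 'pN' is the
# first N characters and 'sN' the last N characters, apparently taken from
# the lowercased lemma in t['lem']. A standalone sketch of that logic (an
# illustration, not LocalFeatureExtractor's actual implementation):

def affix_features(lem, max_len=4):
    feats = {}
    for n in range(1, max_len + 1):
        if len(lem) >= n:
            feats['p{0}'.format(n)] = lem[:n]   # prefix of length n
            feats['s{0}'.format(n)] = lem[-n:]  # suffix of length n
    return feats

# affix_features(u'lõuna-eesti') yields p1='l', p2='lõ', p3='lõu', p4='lõun'
# and s1='i', s2='ti', s3='sti', s4='esti', matching the values asserted above.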
def __init__(self, raw_synset):
    """
    Parameters
    ----------
    raw_synset : eurown.Synset
        Underlying Synset.
    """
    self.name = _get_key_from_raw_synset(raw_synset)
    self._raw_synset = raw_synset
    self.id = raw_synset.number or -1
    self.pos = as_unicode(raw_synset.pos)
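# A brief construction sketch for the wrapper above. It relies only on the
# pieces actually read from eurown.Synset (.number and .pos) plus the module's
# _get_key_from_raw_synset() helper; the example usage is hypothetical:
#
#     syn = Synset(raw_synset)
#     syn.name  # human-readable key computed from the raw synset
#     syn.id    # raw_synset.number, or -1 when the number is missing
#     syn.pos   # part of speech, normalized to a unicode string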
def process_line(self, line):
    """Process a line of data.

    Sends the data through the pipe to the process and flushes it.
    Reads the resulting line and returns it.

    Parameters
    ----------
    line: str
        The data sent to the process. Make sure it does not contain
        any newline characters.

    Returns
    -------
    str: The line returned by the Java process.

    Raises
    ------
    Exception
        In case EOF is encountered.
    IOError
        In case it was impossible to read or write from the subprocess
        standard input / output.
    """
    assert isinstance(line, str)
    try:
        self._process.stdin.write(as_binary(line))
        self._process.stdin.write(as_binary('\n'))
        self._process.stdin.flush()
        result = as_unicode(self._process.stdout.readline())
        if result == '':
            stderr = as_unicode(self._process.stderr.read())
            raise Exception(
                'EOF encountered while reading stream. Stderr is {0}.'.format(stderr))
        return result
    except Exception:
        self._process.terminate()
        raise
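# A toy stand-in showing the same one-line-in, one-line-out protocol, with
# plain subprocess calls instead of the as_binary/as_unicode helpers (the
# class name here is illustrative, not estnltk API):

import subprocess

class _LineProcessDemo(object):
    """Mimics the pattern above: write a line, flush, read one line back."""

    def __init__(self, args):
        self._process = subprocess.Popen(args,
                                         stdin=subprocess.PIPE,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)

    def process_line(self, line):
        self._process.stdin.write((line + '\n').encode('utf-8'))
        self._process.stdin.flush()
        return self._process.stdout.readline().decode('utf-8')

# _LineProcessDemo(['cat']).process_line('tere') returns 'tere\n', i.e. one
# round-trip through an external process.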
def process_lines(self, input_lines, **kwargs):
    ''' Executes the pipeline of subsequent VISL_CG3 commands. The first process
        in the pipeline gets input_lines as input, and each subsequent process
        gets the output of the previous process as input.

        The idea of how to construct the pipeline borrows from:
            https://github.com/estnltk/estnltk/blob/1.4.0/estnltk/syntax/tagger.py

        Returns the result of the last process in the pipeline, either as a
        string or, alternatively, as a list of strings (if split_result == True);

        Parameters
        ----------
        input_lines : list of str
            The input text for the pipeline; Should be in the same format as
            the output of SyntaxPreprocessing;
        split_result : bool
            Optional argument specifying whether the result should be split by
            newlines, and returned as a list of strings/lines instead;
            Default: False
        remove_info : bool
            Optional argument specifying whether the additional information
            added during preprocessing and syntactic processing should be
            removed from the results;
            Default: True;
            The method cleanup_lines() will be used for removing the additional
            info, and all parameters passed to this method will also be
            forwarded to the cleanup method;
    '''
    split_result_lines = False
    remove_info = True
    for arg_name, arg_val in kwargs.items():
        if arg_name in ['split_result_lines', 'split_result'] and arg_val in [True, False]:
            split_result_lines = arg_val
        if arg_name in ['remove_info', 'info_remover', 'clean_up'] and arg_val in [True, False]:
            remove_info = arg_val

    # 1) Construct the input file for the first process in the pipeline
    temp_input_file = \
        tempfile.NamedTemporaryFile(prefix='vislcg3_in.', mode='w', delete=False)
    temp_input_file.close()
    # We have to open the file separately for writing, because Py 2.7 does not
    # support passing encoding='utf-8' to NamedTemporaryFile;
    out_f = codecs.open(temp_input_file.name, mode='w', encoding='utf-8')
    for line in input_lines:
        out_f.write(line.rstrip())
        out_f.write('\n')
    out_f.close()
    # TODO: the tempfile is currently used to ensure that the input is in
    # 'utf-8', but perhaps we can somehow ensure it without using a tempfile?

    # 2) Dynamically construct the pipeline and open processes
    pipeline = []
    for i in range(len(self.rules_pipeline)):
        rule_file = self.rules_pipeline[i]
        process_cmd = [self.vislcg_cmd, '-o', '-g',
                       os.path.join(self.rules_dir, rule_file)]
        if i == 0:
            # The first process takes input from the file
            process_cmd.extend(['-I', temp_input_file.name])
            process = Popen(process_cmd, stdin=PIPE, stdout=PIPE)
        else:
            # A subsequent process takes the output of the previous process as input
            process = Popen(process_cmd, stdin=pipeline[-1]['process'].stdout, stdout=PIPE)
        # Record the process
        pipeline.append({'process': process, 'cmd': process_cmd})

    # 3) Close all stdout streams, except the last one
    for i in range(len(pipeline)):
        if i != len(pipeline) - 1:
            pipeline[i]['process'].stdout.close()

    # 4) Communicate results from the last item in the pipeline
    result = as_unicode(pipeline[-1]['process'].communicate()[0])
    pipeline[-1]['process'].stdout.close()  # Close the last process

    # Clean-up
    # 1) remove the temp file
    os.remove(temp_input_file.name)
    # 2) remove additional info, if required
    if remove_info:
        result = '\n'.join(cleanup_lines(result.split('\n'), **kwargs))

    return result if not split_result_lines else result.split('\n')
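# The core trick in process_lines() is chaining Popen objects so that each
# stage's stdout feeds the next stage's stdin. A standalone sketch of that
# pattern using shell utilities in place of vislcg3 (illustration only;
# for large inputs you would write and read concurrently to avoid pipe
# deadlock):

from subprocess import Popen, PIPE

def chain_commands(commands, input_bytes):
    """Pipe input_bytes through each command in turn, like cmd1 | cmd2 | ..."""
    procs = []
    for i, cmd in enumerate(commands):
        stdin = PIPE if i == 0 else procs[-1].stdout
        procs.append(Popen(cmd, stdin=stdin, stdout=PIPE))
    # Drop our handles on intermediate stdouts so only the children hold them
    for p in procs[:-1]:
        p.stdout.close()
    # Feed the first stage, then collect the output of the last one
    procs[0].stdin.write(input_bytes)
    procs[0].stdin.close()
    return procs[-1].communicate()[0]

# chain_commands([['cat'], ['sort']], b'b\na\n') returns b'a\nb\n'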