def extract_features(self, sentence, i, window_size=3):
    """Build a CRF feature dict for the word at position *i* of *sentence*.

    Features are collected from every word within *window_size* positions
    on either side of *i*; each feature name is prefixed with the signed
    offset of the word it came from (e.g. '-1lemma', '0tag', '2case').
    """
    def looks_like_recent_year(token):
        # True only for tokens that parse as an integer in 1990-2012.
        try:
            return 1990 <= int(token) <= 2012
        except ValueError:
            return False

    features = {}
    for offset in xrange(-window_size, window_size + 1):
        position = i + offset
        if not (0 <= position < len(sentence)):
            continue  # window extends past the sentence boundary
        word = sentence[position]
        lemma = word.lemma
        if is_numeric(lemma):
            # Normalize locale-formatted numbers to a canonical integer string.
            lemma = str(int(atof(lemma)))
        segment = word.segment
        tag = word.tag
        lemma_u = lemma.decode('utf-8')
        word_features = {
            'segment': segment,
            'tag': lt.get_tag(tag),
            'case': lt.get_case(tag),
            'number': lt.get_number(tag),
            'gender': lt.get_gender(tag),
            'person': lt.get_person(tag),
            'aspect': lt.get_aspect(tag),
            'lemma': lemma,
            'recent_year': str(int(looks_like_recent_year(lemma))),
            'alldigits': str(int(lemma.isdigit())),
            'allalpha': str(int(lemma_u.isalpha())),
            'starts_with_capital': str(int(lemma_u[0].isupper())),
            'segm_starts_with_capital': str(int(segment.decode('utf-8')[0].isupper())),
            'numeric': str(int(is_numeric(lemma))),
            'len': str(len(lemma)),
        }
        if use_parser:
            word_features['parse'] = word.parse
        if use_wordnet:
            word_features['synset'] = get_hypernym(word)
        for feature_name, feature_value in word_features.iteritems():
            features['%d%s' % (offset, feature_name)] = feature_value
    return features
 def extract_values(self, extracted_sentences, confidence_level=.8):
     setlocale(LC_NUMERIC, 'pl_PL.UTF-8')
     sentences = [
         sentence
         for entity, sentences in extracted_sentences.iteritems()
         for sentence in sentences
     ]
     self.save_features_to_file(self.features_tag_filename, sentences)
     command = 'crfsuite tag -i -m %s %s' % (models_cache_path % self.model_filename, models_cache_path % self.features_tag_filename)
     p = Popen(command, stdout=PIPE, stdin=PIPE, stderr=PIPE, shell=True)
     out, _ = p.communicate()
     tags_list = []
     tags = []
     for line in out.split('\n')[:-1]:
         if not line:
             tags_list.append(tags)
             tags = []
         else:
             tags.append((line[0], float(line[2:])))
     extracted_values = {}
     i = 0
     for entity, sentences in extracted_sentences.iteritems():
         values = []
         for sentence in sentences:
             tags = tags_list[i]
             i += 1
             value = []
             value_prob = 1
             #automatically join hyphenated words
             for j, word in enumerate(sentence):
                 if word.lemma == '-':
                     tags[j] = ('1', 1)
             for word, (tag, p) in zip(sentence, tags) + [('', ('0', 1))]:
                 if tag == '1':
                     if word.lemma == '-' and not value:
                         continue
                     value.append(word.lemma)
                     value_prob = min(value_prob, p)
                 elif value:
                     if value[-1] == '-':
                         value.pop()
                     if not value:
                         continue
                     v = '_'.join(value).replace('_-_', '-')
                     value = []
                     #gmina can have the same name as its main city (in fact, it very often does)
                     if v.decode('utf-8')[:4] != entity.decode('utf-8')[:4] or self.predicate in ['gmina']:
                         values.append((v, value_prob))
                     value_prob = 1
         #sort by decreasing probabilities
         values = filter(lambda (_, p): p > confidence_level, values)
         values.sort(key=lambda (_, p): -p)
         values = map(lambda (v, p): (str(int(atof(v))) if is_numeric(v) else v, p), values)
         values = map(
             lambda (v, p): ((v.decode('utf-8')[0].upper() + v.decode('utf-8')[1:]).encode('utf-8') if '_' in v else v, p), 
             values
         )
         if verbose:
             print entity, values
         values = [v for v, _ in values]
         if values:
             if self.predicate in numeric_predicates:
                 extracted_values[entity] = values[0]
                 continue
             #to increase precision of extraction (at the cost of recall) in textual relations, 
             #only values that are geographic entities in DBPedia are returned
             values_identified_as_entities = [
                 v for v in values if lt.is_entity(v)
             ]
             values_identified_as_entities_of_right_type = [
                 v for v in values_identified_as_entities if\
                 any(entities_types.index(t) in lt.entities[v] for t in self.predominant_types)
             ]
             if verbose:
                 print ' '.join(values),
                 print ' '.join(values_identified_as_entities),
                 print ' '.join(values_identified_as_entities_of_right_type)
             if values_identified_as_entities_of_right_type:
                 extracted_values[entity] = values_identified_as_entities_of_right_type[0]
             elif values_identified_as_entities:
                 extracted_values[entity] = values_identified_as_entities[0]
             elif self.predicate in ['gmina', 'powiat', quote_plus('województwo'), 'hrabstwo', 'stan']:
                 extracted_values[entity] = values[0]
     if not save_to_cache:
         try:
             os.remove(models_cache_path % self.model_filename)
         except IOError:
             pass
     return extracted_values
                
def extract_values(self, extracted_sentences, confidence_level=.8):
    # NOTE(review): this method is a byte-for-byte duplicate of the
    # extract_values defined above; in Python the later definition silently
    # shadows the earlier one, so only this copy is ever called. One of the
    # two copies should be deleted.
    """Tag sentences with the trained CRF model and pick one value per entity.

    :param extracted_sentences: dict mapping an entity name (utf-8 byte
        string) to a list of tokenized sentences mentioning it.
    :param confidence_level: minimum per-value probability (the min over
        the probabilities of the words making up the value) required for
        a candidate value to be kept.
    :return: dict mapping entity name to the single extracted value.
    """
    setlocale(LC_NUMERIC, 'pl_PL.UTF-8')
    sentences = [
        sentence
        for entity, sentences in extracted_sentences.iteritems()
        for sentence in sentences
    ]
    self.save_features_to_file(self.features_tag_filename, sentences)
    command = 'crfsuite tag -i -m %s %s' % (models_cache_path % self.model_filename, models_cache_path % self.features_tag_filename)
    p = Popen(command, stdout=PIPE, stdin=PIPE, stderr=PIPE, shell=True)
    out, _ = p.communicate()
    # crfsuite output: one "<tag>:<probability>" line per word and a blank
    # line after each sentence; [:-1] drops the empty trailing element
    # produced by the final newline.
    tags_list = []
    tags = []
    for line in out.split('\n')[:-1]:
        if not line:
            tags_list.append(tags)
            tags = []
        else:
            tags.append((line[0], float(line[2:])))
    extracted_values = {}
    i = 0
    # iteritems() iterates in the same order as the feature-file pass above
    # (the dict is unmodified), so tags_list[i] lines up with each sentence.
    for entity, sentences in extracted_sentences.iteritems():
        values = []
        for sentence in sentences:
            tags = tags_list[i]
            i += 1
            value = []
            value_prob = 1
            #automatically join hyphenated words
            for j, word in enumerate(sentence):
                if word.lemma == '-':
                    tags[j] = ('1', 1)
            # The sentinel ('', ('0', 1)) flushes a value that ends at the
            # last word of the sentence.
            for word, (tag, p) in zip(sentence, tags) + [('', ('0', 1))]:
                if tag == '1':
                    if word.lemma == '-' and not value:
                        continue  # ignore a leading hyphen
                    value.append(word.lemma)
                    value_prob = min(value_prob, p)
                elif value:
                    if value[-1] == '-':
                        value.pop()  # drop a trailing hyphen
                    if not value:
                        continue
                    v = '_'.join(value).replace('_-_', '-')
                    value = []
                    # Skip candidates sharing a 4-character prefix with the
                    # entity itself.
                    #gmina can have the same name as its main city (in fact, it very often does)
                    if v.decode('utf-8')[:4] != entity.decode('utf-8')[:4] or self.predicate in ['gmina']:
                        values.append((v, value_prob))
                    value_prob = 1
        #sort by decreasing probabilities
        values = filter(lambda (_, p): p > confidence_level, values)
        values.sort(key=lambda (_, p): -p)
        # Canonicalize numeric values and capitalize multi-word values.
        values = map(lambda (v, p): (str(int(atof(v))) if is_numeric(v) else v, p), values)
        values = map(
            lambda (v, p): ((v.decode('utf-8')[0].upper() + v.decode('utf-8')[1:]).encode('utf-8') if '_' in v else v, p), 
            values
        )
        if verbose:
            print entity, values
        values = [v for v, _ in values]
        if values:
            if self.predicate in numeric_predicates:
                extracted_values[entity] = values[0]
                continue
            #to increase precision of extraction (at the cost of recall) in textual relations, 
            #only values that are geographic entities in DBPedia are returned
            values_identified_as_entities = [
                v for v in values if lt.is_entity(v)
            ]
            values_identified_as_entities_of_right_type = [
                v for v in values_identified_as_entities if\
                any(entities_types.index(t) in lt.entities[v] for t in self.predominant_types)
            ]
            if verbose:
                print ' '.join(values),
                print ' '.join(values_identified_as_entities),
                print ' '.join(values_identified_as_entities_of_right_type)
            if values_identified_as_entities_of_right_type:
                extracted_values[entity] = values_identified_as_entities_of_right_type[0]
            elif values_identified_as_entities:
                extracted_values[entity] = values_identified_as_entities[0]
            elif self.predicate in ['gmina', 'powiat', quote_plus('województwo'), 'hrabstwo', 'stan']:
                extracted_values[entity] = values[0]
    if not save_to_cache:
        try:
            os.remove(models_cache_path % self.model_filename)
        # NOTE(review): in Python 2 os.remove raises OSError, which IOError
        # does not catch — this best-effort cleanup likely never suppresses
        # the error it intends to; should probably be `except OSError`.
        except IOError:
            pass
    return extracted_values