def rerank(self, results): ''' format of the results: [ [{_type, _index, sentence,offset] [{...}], ... [] ] ''' # with codecs.open('tmp/results.json', 'wb', 'utf-8') as mf: # json.dump(results, mf, indent=2) out = deepcopy(results) if 'topic_model' in self.opts.features: print self.train_raw.keys() base_module = import_module('libs.supervised.features.topic_model') f = base_module.Feature([ inst[1] for inst in self.train_raw[results[0][0]['topic'].upper()] ]) self.extractors.append(f) self.train(train_set=self.train_raw) topop = [] # TOPIC_MODEL (LSA) FEATURE # Needs different initialization, that's why it is defined here for res in results: for i in range(len(res[0]['offset'])): for s in res[0]['sentence']: feature_vector = [ ext.extract(res[0]['query'], s) for ext in self.extractors ] svm = self.models[res[0]['topic']] print svm.run(feature_vector) import pdb pdb.set_trace() print '----' # svm.predict(feature_vector) for j in range( i + 1, min(self.opts.lookup + self.opts.cutoff, len(res[0]['offset']))): # look at lower ranked results, if there are overlaps, # merge newspan = merge_offsets(res[0]['offset'][i], res[0]['offset'][j], relaxation=self.opts.relaxation) if newspan is not None: res[0]['offset'][i] = newspan topop.append(j) res[0]['offset'] = [ x for idx1, x in enumerate(res[0]['offset']) if idx1 not in topop ] res[0]['offset'] = res[0]['offset'][0:self.opts.cutoff] return results
def rerank(self, results):
    '''Merge overlapping offset spans in each result and truncate.

    format of the results:
        [
            [{_type, _index, sentence, offset}],
            [{...}],
            ...
            []
        ]

    Returns ``results`` mutated in place: per result, overlapping spans
    within the first ``self.opts.lookup + self.opts.cutoff`` ranks are
    merged, then 'sentence' and 'offset' are filtered of merged-away
    entries and truncated to ``self.opts.cutoff``.
    '''
    for res in results:
        # BUG FIX: topop is reset per result; previously one shared list
        # carried pop-indexes from earlier results into later ones
        # (and the unused `out`/`topopandrevers` locals are gone).
        topop = []
        for i in range(len(res[0]['offset'])):
            for j in range(i + 1,
                           min(self.opts.lookup + self.opts.cutoff,
                               len(res[0]['offset']))):
                # look at lower ranked results, if there are overlaps,
                # merge
                newspan = merge_offsets(res[0]['offset'][i],
                                        res[0]['offset'][j],
                                        relaxation=self.opts.relaxation)
                if newspan is not None:
                    res[0]['offset'][i] = newspan
                    topop.append(j)
        res[0]['offset'] = [x for idx1, x in enumerate(res[0]['offset'])
                            if idx1 not in topop]
        # keep sentences aligned with the surviving offsets
        res[0]['sentence'] = [x for idx1, x in enumerate(res[0]['sentence'])
                              if idx1 not in topop][0:self.opts.cutoff]
        res[0]['offset'] = res[0]['offset'][0:self.opts.cutoff]
    return results
def rerank(self, results): ''' format of the results: [ [{_type, _index, sentence,offset] [{...}], ... [] ] ''' # with codecs.open('tmp/results.json', 'wb', 'utf-8') as mf: # json.dump(results, mf, indent=2) out = deepcopy(results) if 'topic_model' in self.opts.features: print self.train_raw.keys() base_module = import_module('libs.supervised.features.topic_model') f = base_module.Feature( [inst[1] for inst in self.train_raw[results[0][0]['topic'].upper()]]) self.extractors.append(f) self.train(train_set=self.train_raw) topop = [] # TOPIC_MODEL (LSA) FEATURE # Needs different initialization, that's why it is defined here for res in results: for i in range(len(res[0]['offset'])): for s in res[0]['sentence']: feature_vector = [ ext.extract(res[0]['query'], s) for ext in self.extractors] svm = self.models[res[0]['topic']] print svm.run(feature_vector) import pdb pdb.set_trace() print '----' # svm.predict(feature_vector) for j in range(i + 1, min(self.opts.lookup + self.opts.cutoff, len(res[0]['offset']))): # look at lower ranked results, if there are overlaps, # merge newspan = merge_offsets(res[0]['offset'][i], res[0]['offset'][j], relaxation=self.opts.relaxation) if newspan is not None: res[0]['offset'][i] = newspan topop.append(j) res[0]['offset'] = [ x for idx1, x in enumerate(res[0]['offset']) if idx1 not in topop] res[0]['offset'] = res[0]['offset'][0:self.opts.cutoff] return results
def rerank(self, results):
    '''Rerank each result's sentences by model relevance, then merge spans.

    format of the results:
        [
            [{_type, _index, sentence, offset}],
            [{...}],
            ...
            []
        ]

    For every non-empty result, a feature vector is extracted per
    sentence, scored with ``self.model.predict``, and 'offset'/'sentence'
    are reordered by descending score (other keys are copied through).
    If ``self.opts.merge`` is set, overlapping offset spans are merged
    and the list is truncated to ``self.opts.cutoff``.

    Returns a new list; empty results are passed through unchanged.
    '''
    if 'topic_model' in self.opts.features:
        # TOPIC_MODEL (LSA) FEATURE
        # Needs different initialization, that's why it is defined here:
        # built from the training sentences of this query's topic.
        base_module = import_module('libs.supervised.features.topic_model')
        f = base_module.Feature(
            [inst[1]
             for inst in self.train_raw[results[0][0]['topic'].upper()]])
        self.extractors.append(f)
    self.train(train_set=self.train_raw)
    new_results = []
    for res in results:
        if len(res[0]['offset']) > 0:
            relevance_score = []
            for i in range(len(res[0]['offset'])):
                s = res[0]['sentence'][i]
                feature_vector = [ext.extract(res[0]['query'], s)
                                  for ext in self.extractors]
                # (relevance_score, index of the sentence)
                relevance_score.append(
                    (self.model.predict(feature_vector)[0][0][1], i))
            # sort by relevance score
            relevance_score.sort(key=operator.itemgetter(0), reverse=True)
            offs = [res[0]['offset'][e[1]] for e in relevance_score]
            sents = [res[0]['sentence'][e[1]] for e in relevance_score]
            new_res = [{'offset': offs,
                        'sentence': sents,
                        'query': res[0]['query']}]
            # carry over any remaining keys (_type, _index, topic, ...)
            for k in res[0]:
                if k not in new_res[0]:
                    new_res[0][k] = res[0][k]
            new_results.append(new_res)
        else:
            new_results.append(res)
    if self.opts.merge:
        for res in new_results:
            # BUG FIX: topop is reset per result; previously one shared
            # list carried pop-indexes from earlier results into later
            # ones (debug pdb/print leftovers also removed).
            topop = []
            for i in range(len(res[0]['offset'])):
                for j in range(i + 1,
                               min(self.opts.lookup + self.opts.cutoff,
                                   len(res[0]['offset']))):
                    # look at lower ranked results, if there are
                    # overlaps, merge
                    newspan = merge_offsets(
                        res[0]['offset'][i],
                        res[0]['offset'][j],
                        relaxation=self.opts.relaxation)
                    if newspan is not None:
                        res[0]['offset'][i] = newspan
                        topop.append(j)
            res[0]['offset'] = [x for idx1, x
                                in enumerate(res[0]['offset'])
                                if idx1 not in topop]
            res[0]['offset'] = res[0]['offset'][0:self.opts.cutoff]
    return new_results
def rerank(self, results):
    '''Rerank each result's sentences by model relevance, then merge spans.

    format of the results:
        [
            [{_type, _index, sentence, offset}],
            [{...}],
            ...
            []
        ]

    For every non-empty result, a feature vector is extracted per
    sentence, scored with ``self.model.predict``, and 'offset'/'sentence'
    are reordered by descending score (other keys are copied through).
    If ``self.opts.merge`` is set, overlapping offset spans are merged
    and the list is truncated to ``self.opts.cutoff``.

    Returns a new list; empty results are passed through unchanged.
    '''
    if 'topic_model' in self.opts.features:
        # TOPIC_MODEL (LSA) FEATURE
        # Needs different initialization, that's why it is defined here:
        # built from the training sentences of this query's topic.
        base_module = import_module('libs.supervised.features.topic_model')
        f = base_module.Feature(
            [inst[1]
             for inst in self.train_raw[results[0][0]['topic'].upper()]])
        self.extractors.append(f)
    self.train(train_set=self.train_raw)
    new_results = []
    for res in results:
        if len(res[0]['offset']) > 0:
            relevance_score = []
            for i in range(len(res[0]['offset'])):
                s = res[0]['sentence'][i]
                feature_vector = [ext.extract(res[0]['query'], s)
                                  for ext in self.extractors]
                # (relevance_score, index of the sentence)
                relevance_score.append(
                    (self.model.predict(feature_vector)[0][0][1], i))
            # sort by relevance score
            relevance_score.sort(key=operator.itemgetter(0), reverse=True)
            offs = [res[0]['offset'][e[1]] for e in relevance_score]
            sents = [res[0]['sentence'][e[1]] for e in relevance_score]
            new_res = [{'offset': offs,
                        'sentence': sents,
                        'query': res[0]['query']}]
            # carry over any remaining keys (_type, _index, topic, ...)
            for k in res[0]:
                if k not in new_res[0]:
                    new_res[0][k] = res[0][k]
            new_results.append(new_res)
        else:
            new_results.append(res)
    if self.opts.merge:
        for res in new_results:
            # BUG FIX: topop is reset per result; previously one shared
            # list carried pop-indexes from earlier results into later
            # ones (debug pdb/print leftovers also removed).
            topop = []
            for i in range(len(res[0]['offset'])):
                for j in range(i + 1,
                               min(self.opts.lookup + self.opts.cutoff,
                                   len(res[0]['offset']))):
                    # look at lower ranked results, if there are
                    # overlaps, merge
                    newspan = merge_offsets(
                        res[0]['offset'][i],
                        res[0]['offset'][j],
                        relaxation=self.opts.relaxation)
                    if newspan is not None:
                        res[0]['offset'][i] = newspan
                        topop.append(j)
            res[0]['offset'] = [x for idx1, x
                                in enumerate(res[0]['offset'])
                                if idx1 not in topop]
            res[0]['offset'] = res[0]['offset'][0:self.opts.cutoff]
    return new_results