Esempio n. 1
0
    def rerank(self, results):
        """Merge overlapping offset spans of each result and truncate to cutoff.

        format of the results:
        [ [{_type, _index, sentence, offset, query, topic, ...}],
          [{...}],
          ...
          []
        ]

        Returns ``results``, modified in place.
        """
        # NOTE(review): the unused `out = deepcopy(results)` was removed; the
        # method mutates and returns `results` directly.

        # TOPIC_MODEL (LSA) feature: needs per-topic training data, so it is
        # initialized here rather than with the other extractors.
        if 'topic_model' in self.opts.features:
            print(self.train_raw.keys())
            base_module = import_module('libs.supervised.features.topic_model')
            topic_feature = base_module.Feature([
                inst[1]
                for inst in self.train_raw[results[0][0]['topic'].upper()]
            ])
            self.extractors.append(topic_feature)

        self.train(train_set=self.train_raw)

        for res in results:
            # BUG FIX: `topop` was created once before the loop, so indices
            # popped from one result leaked into every later result; it is
            # now reset per result.
            topop = []

            # Score every sentence once.  This loop did not depend on the
            # offset index, so it was hoisted out of the i-loop below, and
            # the leftover `import pdb; pdb.set_trace()` debugging removed.
            if res[0]['offset']:
                svm = self.models[res[0]['topic']]
                for sentence in res[0]['sentence']:
                    feature_vector = [
                        ext.extract(res[0]['query'], sentence)
                        for ext in self.extractors
                    ]
                    print(svm.run(feature_vector))
                print('----')

            for i in range(len(res[0]['offset'])):
                # Look at lower ranked results; if there are overlaps, merge.
                for j in range(
                        i + 1,
                        min(self.opts.lookup + self.opts.cutoff,
                            len(res[0]['offset']))):
                    newspan = merge_offsets(res[0]['offset'][i],
                                            res[0]['offset'][j],
                                            relaxation=self.opts.relaxation)
                    if newspan is not None:
                        res[0]['offset'][i] = newspan
                        topop.append(j)
            res[0]['offset'] = [
                x for idx, x in enumerate(res[0]['offset'])
                if idx not in topop
            ]
            res[0]['offset'] = res[0]['offset'][:self.opts.cutoff]
        return results
Esempio n. 2
0
    def rerank(self, results):
        """Merge overlapping offset spans of each result and truncate to cutoff.

        format of the results:
        [ [{_type, _index, sentence, offset, ...}],
          [{...}],
          ...
          []
        ]

        Keeps the ``sentence`` list aligned with the surviving offsets and
        cuts both at ``self.opts.cutoff``.  Returns ``results``, modified in
        place.
        """
        # NOTE(review): the unused `out = deepcopy(results)` and
        # `topopandrevers` variables were removed.
        for res in results:
            # BUG FIX: `topop` was created once before the loop, so indices
            # popped from one result leaked into every later result; it is
            # now reset per result.
            topop = []
            n_offsets = len(res[0]['offset'])
            # Only look this far down the ranking for merge candidates.
            limit = min(self.opts.lookup + self.opts.cutoff, n_offsets)
            for i in range(n_offsets):
                # Look at lower ranked results; if there are overlaps, merge.
                for j in range(i + 1, limit):
                    newspan = merge_offsets(res[0]['offset'][i],
                                            res[0]['offset'][j],
                                            relaxation=self.opts.relaxation)
                    if newspan is not None:
                        res[0]['offset'][i] = newspan
                        topop.append(j)
            res[0]['offset'] = [
                x for idx, x in enumerate(res[0]['offset'])
                if idx not in topop]
            # Drop the sentences whose offsets were merged away, then cut.
            res[0]['sentence'] = [
                x for idx, x in enumerate(res[0]['sentence'])
                if idx not in topop][:self.opts.cutoff]
            res[0]['offset'] = res[0]['offset'][:self.opts.cutoff]
        return results
Esempio n. 3
0
    def rerank(self, results):
        """Merge overlapping offset spans of each result and truncate to cutoff.

        format of the results:
        [ [{_type, _index, sentence, offset, query, topic, ...}],
          [{...}],
          ...
          []
        ]

        Returns ``results``, modified in place.
        """
        # NOTE(review): the unused `out = deepcopy(results)` was removed.

        # TOPIC_MODEL (LSA) feature: needs per-topic training data, so it is
        # initialized here rather than with the other extractors.
        if 'topic_model' in self.opts.features:
            print(self.train_raw.keys())
            base_module = import_module('libs.supervised.features.topic_model')
            topic_feature = base_module.Feature(
                [inst[1]
                 for inst in self.train_raw[results[0][0]['topic'].upper()]])
            self.extractors.append(topic_feature)

        self.train(train_set=self.train_raw)

        for res in results:
            # BUG FIX: `topop` was created once before the loop, so indices
            # popped from one result leaked into every later result; it is
            # now reset per result.
            topop = []

            # Score every sentence once.  This loop did not depend on the
            # offset index, so it was hoisted out of the i-loop below, and
            # the leftover `import pdb; pdb.set_trace()` debugging removed.
            if res[0]['offset']:
                svm = self.models[res[0]['topic']]
                for sentence in res[0]['sentence']:
                    feature_vector = [
                        ext.extract(res[0]['query'], sentence)
                        for ext in self.extractors]
                    print(svm.run(feature_vector))
                print('----')

            for i in range(len(res[0]['offset'])):
                # Look at lower ranked results; if there are overlaps, merge.
                for j in range(i + 1,
                               min(self.opts.lookup + self.opts.cutoff,
                                   len(res[0]['offset']))):
                    newspan = merge_offsets(res[0]['offset'][i],
                                            res[0]['offset'][j],
                                            relaxation=self.opts.relaxation)
                    if newspan is not None:
                        res[0]['offset'][i] = newspan
                        topop.append(j)
            res[0]['offset'] = [
                x for idx, x in enumerate(res[0]['offset'])
                if idx not in topop]
            res[0]['offset'] = res[0]['offset'][:self.opts.cutoff]
        return results
Esempio n. 4
0
    def rerank(self, results):
        """Rerank each result's sentences by predicted relevance.

        format of the results:
        [ [{_type, _index, sentence, offset, query, topic, ...}],
          [{...}],
          ...
          []
        ]

        Sentences and offsets are reordered by the score of
        ``self.model.predict``; when ``self.opts.merge`` is set, overlapping
        offsets are then merged and the offset list cut at
        ``self.opts.cutoff``.  Returns a new results list; groups with no
        offsets are passed through unchanged.
        """
        # TOPIC_MODEL (LSA) feature: needs per-topic training data, so it is
        # initialized here rather than with the other extractors.
        if 'topic_model' in self.opts.features:
            base_module = import_module('libs.supervised.features.topic_model')
            topic_feature = base_module.Feature(
                [inst[1]
                 for inst in self.train_raw[results[0][0]['topic'].upper()]])
            self.extractors.append(topic_feature)

        self.train(train_set=self.train_raw)

        # NOTE(review): the dead `topop = []` that preceded this loop was
        # removed; the merge phase below creates its own.
        new_results = []
        for res in results:
            if res[0]['offset']:
                relevance_score = []
                for i in range(len(res[0]['offset'])):
                    s = res[0]['sentence'][i]
                    feature_vector = [
                        ext.extract(res[0]['query'], s)
                        for ext in self.extractors]
                    # (relevance score, index of the sentence)
                    relevance_score.append(
                        (self.model.predict(feature_vector)[0][0][1], i))
                # Sort by relevance score, best first.
                relevance_score.sort(key=operator.itemgetter(0), reverse=True)
                offs = [res[0]['offset'][e[1]] for e in relevance_score]
                sents = [res[0]['sentence'][e[1]] for e in relevance_score]
                new_res = [{'offset': offs, 'sentence': sents,
                            'query': res[0]['query']}]
                # Carry over any remaining metadata keys untouched.
                for k in res[0]:
                    if k not in new_res[0]:
                        new_res[0][k] = res[0][k]
                new_results.append(new_res)
            else:
                # Nothing to rerank in this group.
                new_results.append(res)
            print('----')

        if self.opts.merge:
            for res in new_results:
                # BUG FIX: `topop` was created once for the whole merge
                # phase, so indices popped from one result leaked into every
                # later result; it is now reset per result.
                topop = []
                for i in range(len(res[0]['offset'])):
                    # Look at lower ranked results; merge overlapping spans.
                    for j in range(i + 1,
                                   min(self.opts.lookup + self.opts.cutoff,
                                       len(res[0]['offset']))):
                        newspan = merge_offsets(
                            res[0]['offset'][i],
                            res[0]['offset'][j],
                            relaxation=self.opts.relaxation)
                        if newspan is not None:
                            res[0]['offset'][i] = newspan
                            topop.append(j)
                res[0]['offset'] = [
                    x for idx, x in enumerate(res[0]['offset'])
                    if idx not in topop]
                res[0]['offset'] = res[0]['offset'][:self.opts.cutoff]

        return new_results
Esempio n. 5
0
    def rerank(self, results):
        """Rerank each result's sentences by predicted relevance.

        format of the results:
        [ [{_type, _index, sentence, offset, query, topic, ...}],
          [{...}],
          ...
          []
        ]

        Sentences and offsets are reordered by the score of
        ``self.model.predict``; when ``self.opts.merge`` is set, overlapping
        offsets are then merged and the offset list cut at
        ``self.opts.cutoff``.  Returns a new results list; groups with no
        offsets are passed through unchanged.
        """
        # TOPIC_MODEL (LSA) feature: needs per-topic training data, so it is
        # initialized here rather than with the other extractors.
        if 'topic_model' in self.opts.features:
            base_module = import_module('libs.supervised.features.topic_model')
            topic_feature = base_module.Feature([
                inst[1]
                for inst in self.train_raw[results[0][0]['topic'].upper()]
            ])
            self.extractors.append(topic_feature)

        self.train(train_set=self.train_raw)

        # NOTE(review): the dead `topop = []` that preceded this loop was
        # removed; the merge phase below creates its own.
        new_results = []
        for res in results:
            if res[0]['offset']:
                relevance_score = []
                for i in range(len(res[0]['offset'])):
                    s = res[0]['sentence'][i]
                    feature_vector = [
                        ext.extract(res[0]['query'], s)
                        for ext in self.extractors
                    ]
                    # (relevance score, index of the sentence)
                    relevance_score.append(
                        (self.model.predict(feature_vector)[0][0][1], i))
                # Sort by relevance score, best first.
                relevance_score.sort(key=operator.itemgetter(0), reverse=True)
                offs = [res[0]['offset'][e[1]] for e in relevance_score]
                sents = [res[0]['sentence'][e[1]] for e in relevance_score]
                new_res = [{
                    'offset': offs,
                    'sentence': sents,
                    'query': res[0]['query']
                }]
                # Carry over any remaining metadata keys untouched.
                for k in res[0]:
                    if k not in new_res[0]:
                        new_res[0][k] = res[0][k]
                new_results.append(new_res)
            else:
                # Nothing to rerank in this group.
                new_results.append(res)
            print('----')

        if self.opts.merge:
            for res in new_results:
                # BUG FIX: `topop` was created once for the whole merge
                # phase, so indices popped from one result leaked into every
                # later result; it is now reset per result.
                topop = []
                for i in range(len(res[0]['offset'])):
                    # Look at lower ranked results; merge overlapping spans.
                    for j in range(
                            i + 1,
                            min(self.opts.lookup + self.opts.cutoff,
                                len(res[0]['offset']))):
                        newspan = merge_offsets(
                            res[0]['offset'][i],
                            res[0]['offset'][j],
                            relaxation=self.opts.relaxation)
                        if newspan is not None:
                            res[0]['offset'][i] = newspan
                            topop.append(j)
                res[0]['offset'] = [
                    x for idx, x in enumerate(res[0]['offset'])
                    if idx not in topop
                ]
                res[0]['offset'] = res[0]['offset'][:self.opts.cutoff]

        return new_results