Example #1
    def save_cache(self):
        output_filename = FileLocations.get_dropbox_intermediate_path(
        ) + 'spotlight_docs_list.pickle'
        temp_filename = FileLocations.get_dropbox_intermediate_path(
        ) + "spotlight_docs_list.tmp.2.pickle"
        lock_acquired = False
        while not lock_acquired:
            try:
                if not os.path.isfile(output_filename) and not os.path.isfile(
                        temp_filename):
                    pass
                else:
                    move(output_filename, temp_filename)
            except OSError:
                # the move failed (most likely another process holds the file); wait and retry
                time.sleep(3)
            else:
                lock_acquired = True
                self.logger.info('lock acquired')
        # do your writing
        self.logger.info('About to write %s', temp_filename)
        with open(temp_filename, 'wb') as handle:
            pickle.dump(self.cache, handle, protocol=pickle.HIGHEST_PROTOCOL)
        self.logger.info('file written = %s', output_filename)

        # release file by moving it back
        move(temp_filename, output_filename)
        # lock_acquired = False
        self.logger.info('lock released')
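The "lock" above is nothing more than renaming the shared pickle out of the way and renaming it back when done; whichever process completes the move owns the file. Below is a minimal standalone sketch of that rename-as-lock idea. The helper names, the retry delay and the error handling are illustrative, not part of the original class.

    import os
    import time
    from shutil import move

    def acquire_by_rename(shared_path, working_copy, delay=3):
        # Loop until there is nothing to move (first run) or the move succeeds.
        while True:
            try:
                if os.path.isfile(shared_path):
                    # Another process attempting the same move will fail and retry.
                    move(shared_path, working_copy)
                return
            except OSError:
                time.sleep(delay)

    def release_by_rename(shared_path, working_copy):
        # Renaming the working copy back makes the file visible to other processes again.
        move(working_copy, shared_path)
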
Example #2
    def load_light_parameter_data_fmt_03():
        # Format
        # First Row : text headers
        # Second row onwards comma separated
        #    DocId,
        #    Entity_id as per wikipedia
        #    Golden Salience as per Trani et al. (saliency as marked by expert annotators ( 0.0 < value <= 3.0 ))
        #    Estimated salience (will be rubbish if the binary classifier has not been trained)
        #    Light features written in list [ 1, 2, 3, None, 7, ...]
        #
        light_param_filename = FileLocations.get_dropbox_intermediate_path() \
                               + 'dexter_light_features_fmt_v03.run.01.txt'
        BinaryClassifierTrainer.logger.info('loading data from : %s',
                                            light_param_filename)

        with open(light_param_filename) as f:
            lines = f.readlines()

        transformed_contents = ''
        for line in lines:
            if len(line.split(sep=',')) != 27:
                # header row (or malformed line) - skip it
                BinaryClassifierTrainer.logger.info('skipping line: %s', line)
            else:
                line = line.replace('[', '')
                line = line.replace(']', '')
                line = line.replace('None', '0.0')
                transformed_contents = transformed_contents + '\n' + line

        fn = FileLocations.get_temp_path() + 'light_output_intermediate.txt'
        file = open(fn, "w")
        file.write(transformed_contents)
        file.close()

        data = genfromtxt(fn, delimiter=',')
        entity_id_array = data[:, 1]
        y = data[:, 2]
        X = data[:, 4:]

        y = y > 0.0  # convert y to only 1.0 and 0.0

        # Overwrite NaNs with 0s
        BinaryClassifierTrainer.logger.debug('Are there NaNs?')
        nan_locations = np.argwhere(np.isnan(X))
        for loc in nan_locations:
            X[loc[0], loc[1]] = 0
        BinaryClassifierTrainer.logger.debug('Are there still any NaNs?')
        nan_locations = np.argwhere(np.isnan(X))
        BinaryClassifierTrainer.logger.debug(nan_locations)

        BinaryClassifierTrainer.logger.info('X Shape %s', X.shape)

        return X, y, entity_id_array
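The explicit NaN loop in the loader above can be collapsed into a single NumPy call. A short equivalent sketch, assuming X and y are the arrays produced by genfromtxt exactly as in the function above:

    import numpy as np

    # nan_to_num also maps +/-inf to large finite values, which the explicit loop does not,
    # but for data containing only NaNs the result is the same zero-filled array.
    X = np.nan_to_num(X, nan=0.0)
    y = y > 0.0  # binarise golden salience: True if any expert marked the entity salient
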
Example #3
    def load_light_parameter_data_fmt_02():
        #
        # First column is whether the entity was marked as salient by expert annotators in
        # the dexter corpus ( value > 0.0 )
        # Second Column is the salience as predicted when this was passed through the pipeline.
        # Third column is a list of feature values
        #
        light_param_filename = FileLocations.get_dropbox_intermediate_path() + \
                               'dexter_light_features_fmt_v02_partial.03.docs-1-195.txt'
        BinaryClassifierTrainer.logger.info('loading data from : %s',
                                            light_param_filename)
        data = genfromtxt(light_param_filename, delimiter=',')
        y = data[:, 0]
        X = data[:, 2:]

        y = y > 0.0

        # Overwrite NaNs with 0s
        BinaryClassifierTrainer.logger.debug('Are there NaNs?')
        nan_locations = np.argwhere(np.isnan(X))
        for loc in nan_locations:
            X[loc[0], loc[1]] = 0
        BinaryClassifierTrainer.logger.debug('Are there still any NaNs?')
        nan_locations = np.argwhere(np.isnan(X))
        BinaryClassifierTrainer.logger.debug(nan_locations)
        BinaryClassifierTrainer.logger.info('X Shape %s', X.shape)
        return X, y
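These snippets do not show which estimator BinaryClassifierTrainer actually fits, so the sketch below uses a scikit-learn LogisticRegression purely as a stand-in to illustrate how the returned X and y could be consumed.

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # Assumes BinaryClassifierTrainer is importable from its (unshown) module;
    # the LogisticRegression is an illustrative stand-in, not the project's model.
    X, y = BinaryClassifierTrainer.load_light_parameter_data_fmt_02()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    print('held-out accuracy:', model.score(X_test, y_test))
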
Example #4
 def save_model(self):
     dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path(
     )
     output_filename = dropbox_intermediate_path + 'binary_classifier.pickle'
     self.logger.info('About to write %s', output_filename)
     with open(output_filename, 'wb') as handle:
         pickle.dump(self.model, handle, protocol=pickle.HIGHEST_PROTOCOL)
     self.logger.info('file written = %s', output_filename)
Example #5
 def save_partial_results(self, prefix, obj):
     output_filename = FileLocations.get_temp_path(
     ) + prefix + '.partial.pickle'
     self.logger.info('About to write %s', output_filename)
     try:
         with open(output_filename, 'wb') as handle:
             pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
     except OSError as e:
         self.logger.warning('Could not save file %s. err=%s',
                             output_filename, str(e))
     else:
         self.logger.info('file written = %s', output_filename)
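save_partial_results writes straight to the target path, so an interrupted dump can leave a truncated pickle behind. A variant sketch (not the behaviour of the method above) writes to a temporary file and renames it into place; the path handling is illustrative.

    import os
    import pickle

    def dump_atomically(obj, output_filename):
        tmp_filename = output_filename + '.tmp'
        with open(tmp_filename, 'wb') as handle:
            pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
        # os.replace is atomic when source and destination share a filesystem,
        # so readers see either the old complete file or the new complete file.
        os.replace(tmp_filename, output_filename)
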
Example #6
 def load_model(self):
     dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path(
     )
     input_filename = dropbox_intermediate_path + 'binary_classifier.pickle'
     if os.path.isfile(input_filename):
         self.logger.info('loading binary classifier from %s',
                          input_filename)
         with open(input_filename, 'rb') as handle:
             self.model = pickle.load(handle)
         self.logger.info('loaded')
     else:
         self.logger.info('Could not load model %s', input_filename)
Example #7
 def __init__(self):
     # set up logging
     handler = logging.StreamHandler()
     handler.setFormatter(
         logging.Formatter(
             '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
     self.logger = logging.getLogger(__name__)
     self.logger.addHandler(handler)
     self.logger.propagate = False
     self.logger.setLevel(logging.INFO)
     # set up instance variables
     wds = WikipediaDataset()
     self.intermediate_path = FileLocations.get_temp_path()
     self.spotlight_util = SpotlightUtil()
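The handler/formatter wiring in this constructor is repeated almost verbatim at module level in Example #12. A small helper like the one below (the function name is hypothetical, not something defined in the project) would cover both call sites:

    import logging

    def build_logger(name, level=logging.INFO):
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
        logger = logging.getLogger(name)
        logger.addHandler(handler)
        logger.propagate = False
        logger.setLevel(level)
        return logger
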
Example #8
    def load_cache(self):
        input_filename = FileLocations.get_dropbox_intermediate_path(
        ) + 'spotlight_docs_list.pickle'

        # the pickle may be mid-write or mid-move by another process; retry a few times
        count = 0
        while count < 3 and not os.path.isfile(input_filename):
            time.sleep(1)
            count += 1

        if os.path.isfile(input_filename):
            self.logger.info('loading spotlight cache from %s', input_filename)
            with open(input_filename, 'rb') as handle:
                self.cache = pickle.load(handle)
            self.logger.info('%d items loaded', len(self.cache.values()))
        else:
            self.logger.info('file does not exist: %s', input_filename)
            self.cache = {}
Example #9
    def load_light_parameter_data_fmt_01():
        #
        # First column is whether the entity was marked as salient by expert annotators in the dexter corpus
        #
        dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path(
        )
        data = genfromtxt(dropbox_intermediate_path +
                          'dexter_light_features_v0001.txt',
                          delimiter=',')
        y = data[:, 0]
        X = data[:, 1:-1]

        # Overwrite NaNs with 0s
        BinaryClassifierTrainer.logger.debug('Are there NaNs?')
        nan_locations = np.argwhere(np.isnan(X))
        for loc in nan_locations:
            X[loc[0], loc[1]] = 0
        BinaryClassifierTrainer.logger.debug('Are there still any NaNs?')
        nan_locations = np.argwhere(np.isnan(X))
        BinaryClassifierTrainer.logger.debug(nan_locations)
        BinaryClassifierTrainer.logger.info('X Shape %s', X.shape)
        return X, y
Example #10
    def process_document(self,
                         optional_docid,
                         body,
                         title,
                         file_prefix,
                         break_early=False,
                         golden_salience_by_entity_id=None,
                         min_candidates_to_pass_through=0,
                         binary_classifier_threshold=0.5,
                         spotter_confidence=0.5):
        if golden_salience_by_entity_id is None:
            golden_salience_by_entity_id = {}
        light_features_by_ent_id, name_by_entity_id = \
            self.light_feature_extractor.get_feature_list_by_ent(body, title,
                                                                 self.spotter,
                                                                 False,
                                                                 spotter_confidence=spotter_confidence)

        self.logger.info('Light features have been calculated ')
        survivor_candidates = []
        predictions_by_entity_id = {}

        light_results = ''
        for entity_id in light_features_by_ent_id.keys():
            light_features = light_features_by_ent_id[entity_id]
            prediction = None
            try:
                prediction = self.binary_classifier.predict(light_features)
            except ValueError as e:
                self.logger.warning(
                    'An exception occurred, could not predict, assuming 0.0. light_features = %s, err=%s',
                    str(light_features), str(e))

            if prediction is None:
                prediction = 0.0
            predictions_by_entity_id[entity_id] = prediction

            self.logger.info(
                'entity_id %d prediction %f binary_classifier_threshold=%f',
                entity_id, prediction, binary_classifier_threshold)

            if prediction > binary_classifier_threshold:  # binary_classifier_threshold appears to be a tuple... why?
                survivor_candidates.append(entity_id)
            golden_salience = 0
            if entity_id in golden_salience_by_entity_id:
                golden_salience = golden_salience_by_entity_id[entity_id]

            light_results = '{0}\n{1},{2},{3},{4},{5}'.format(
                light_results, str(optional_docid), str(entity_id),
                str(golden_salience), str(prediction), str(light_features))

        with open(
                FileLocations.get_temp_path() + file_prefix +
                'light_output_partial.txt', "a") as file:
            file.write(light_results)

        self.logger.info('Predictions %s', predictions_by_entity_id)
        self.logger.info('Survivor candidate entity_id are: %s ',
                         survivor_candidates)

        if len(survivor_candidates) < 1:
            self.logger.warning(
                "No candidates survived, passing first %d through",
                min_candidates_to_pass_through)
            for entity_id in light_features_by_ent_id.keys():
                if len(survivor_candidates) < min_candidates_to_pass_through:
                    survivor_candidates.append(entity_id)

        if self.heavy_feature_extractor is None:
            self.logger.warning(
                'Heavy extractor is None, not performing further processing.')
            return {}, {}, 0, []

        all_heavy_features_by_entity_id = self.heavy_feature_extractor.process(
            survivor_candidates,
            break_early=break_early,
            optional_docId=optional_docid)

        fname = title.replace(' ', '_')
        fname = fname.replace('.', '_')
        fname = fname.replace('"', '_')
        fname = fname.replace('\'', '_')
        fname = fname[0:50].lower()

        self.save_partial_results('all_heavy_features_by_entity_id_' + fname,
                                  all_heavy_features_by_entity_id)

        calculated_saliency_by_entity_id = {}
        output = ''
        for entity_id in all_heavy_features_by_entity_id.keys():
            if entity_id in golden_salience_by_entity_id:
                target_saliency = golden_salience_by_entity_id[entity_id]
            else:
                target_saliency = 0.0

            list_of_heavy_features = all_heavy_features_by_entity_id[entity_id]
            self.logger.info('Number of heavy features %d for entity_id %d ',
                             len(list_of_heavy_features), entity_id)

            calculated_saliency = 0.0
            try:
                pred_array = self.heavy_feature_regressor.predict(
                    np.array(list_of_heavy_features).reshape(1, -1))
                calculated_saliency = pred_array[0]
            except (ValueError, IndexError) as e:
                self.logger.warning(
                    'could not calc gbrt, returning 0. entity_id=%d, x=%s, err=%s',
                    entity_id, list_of_heavy_features, e)

            self.logger.info(
                'calculated saliency for docid = %d entity_id = %d saliency = %f ',
                optional_docid, entity_id, calculated_saliency)

            calculated_saliency_by_entity_id[entity_id] = calculated_saliency

            output = '{0}{1},{2},{3},{4},{5}\n'.format(
                output, str(optional_docid), str(entity_id),
                str(target_saliency), str(calculated_saliency),
                str(all_heavy_features_by_entity_id[entity_id]))

        fn = FileLocations.get_temp_path(
        ) + file_prefix + 'heavy_output_partial.txt'
        self.logger.debug('Appending heavy parameters to %s', fn)
        file = open(fn, "a")
        file.write(output)
        file.close()

        self.logger.info('\n%s', output)

        return calculated_saliency_by_entity_id, golden_salience_by_entity_id
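If self.binary_classifier follows the scikit-learn API (an assumption; its class is not shown in these snippets), the ValueError handled near the top of process_document is typically raised when a plain 1-D feature list is passed to predict. Reshaping the features into a single-row 2-D array avoids it, as in this illustrative sketch with a stand-in classifier:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Stand-in classifier trained on random data, used only to demonstrate the reshape.
    rng = np.random.default_rng(0)
    binary_classifier = LogisticRegression().fit(rng.random((20, 5)), rng.integers(0, 2, 20))

    light_features = [0.1, 0.0, 3.5, 0.0, 1.2]                       # hypothetical feature values
    features_2d = np.array(light_features, dtype=float).reshape(1, -1)  # shape (1, n_features)
    prediction = binary_classifier.predict(features_2d)[0]
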
Example #11
    def main(self, from_, to_, measurement, pipeline_portion):

        # load the data
        dd = DatasetDexter()
        document_list = dd.get_dexter_dataset()

        # process the data
        count = 0

        slcs = SpotlightCachingSpotter()
        light_features_to_zero = []
        lfe = SELLightFeatureExtractor(light_features_to_zero)
        gbrt = None  # GBRT('fred')
        ndcg = NDCG()

        min_candidates_to_pass_through = 3
        binary_classifier_threshold = 0.5
        spotter_confidence = 0.5
        corpus_name = 'dexter_fset_02_'
        break_early = False

        file_prefix = (corpus_name + '_' + str(from_) + '_to_' + str(to_) +
                       '_')
        salience_by_entity_by_doc_id = {}
        time_by_docid = {}

        light_feature_filename = FileLocations.get_temp_path(
        ) + file_prefix + 'light_output_partial.txt'

        with open(light_feature_filename, "a") as file:
            file.write(
                '\ndocId, entity_id, golden_salience, estimated_salience, [light_features]'
            )

        for document in document_list:
            data = json.loads(document)
            docid = data['docId']

            if (count in range(from_, (to_ + 1)) and measurement == 'LINE') or \
                    (docid in range(from_, (to_ + 1)) and measurement == 'DOCID'):
                self.logger.info('_______________________________________')
                self.logger.info('Starting processing of docid = %d  line=%d ',
                                 docid, count)
                start_time = time.time()
                saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(
                    data)
                body = self.extract_body(data)
                title = data['title']

                pipeline = Pipeline002(slcs, lfe, gbrt, ndcg,
                                       light_feature_filename)

                calculated_saliency_by_entity_id, golden_salience_by_entity_id, discount_sum, model_dcgs = \
                    pipeline.process_document(
                        docid,
                        body, title,
                        file_prefix, break_early=break_early,
                        golden_salience_by_entity_id=saliency_by_ent_id_golden,
                        min_candidates_to_pass_through=min_candidates_to_pass_through,
                        binary_classifier_threshold=binary_classifier_threshold,
                        spotter_confidence=spotter_confidence)

                salience_by_entity_by_doc_id[
                    docid] = calculated_saliency_by_entity_id
                self.logger.info('count = %d, docId = %d ', count, docid)
                self.logger.info('calculated_saliency_by_entity_id = %s ',
                                 str(calculated_saliency_by_entity_id))
                self.logger.info('discount_sum = %s ', str(discount_sum))
                self.logger.info('model_dcgs = %s ', str(model_dcgs))

                diff = time.time() - start_time

                time_by_docid[docid] = diff
                self.logger.info('Times taken %s', time_by_docid)
                self.logger.info('Time taken for docid=%d, time=%f', docid,
                                 diff)

            count += 1
        self.logger.info('Times taken by docid: %s', time_by_docid)

        trc = TrecReferenceCreator()
        trc.create_results_file(salience_by_entity_by_doc_id, 'x_temp')
        report, ndcg, p_at = trc.get_report(
            FileLocations.get_dropbox_intermediate_path() +
            'trec_ground_truth.txt', 'x_temp')
        self.logger.info(' Trec Eval Results:\n %s', report)
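The class that defines main is not shown in these snippets, so the invocation below is hypothetical; it assumes a runner object exposing the method and processes the first twenty documents by line number.

    # 'PipelineRunner' is a placeholder for whatever class defines main(); pipeline_portion
    # is passed as None because the body above never uses it.
    runner = PipelineRunner()
    runner.main(0, 19, 'LINE', None)
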
Example #12
import logging
import pickle

from sel.file_locations import FileLocations
# WikipediaDataset is assumed to be importable from elsewhere in the project;
# its module path is not shown in this snippet.

# dense to sparse

# set up logging
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.propagate = False
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    ds = WikipediaDataset()
    wikititle_marisa_trie = ds.get_wikititle_case_insensitive_marisa_trie()
    logger.info('Creating dictionary')
    wikititle_by_id = {}
    for k in wikititle_marisa_trie.keys():
        wid = wikititle_marisa_trie.get(k)[0][0]
        wikititle_by_id[wid] = k

    logger.info('complete')

    output_filename = FileLocations.get_dropbox_wikipedia_path(
    ) + 'wikititle_by_id.pickle'
    logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(wikititle_by_id, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info('file written = %s', output_filename)
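
For completeness, a sketch of reading the dictionary back in a later run, reusing the same path helper; it only assumes the pickle written above exists.

    import pickle

    from sel.file_locations import FileLocations

    input_filename = FileLocations.get_dropbox_wikipedia_path() + 'wikititle_by_id.pickle'
    with open(input_filename, 'rb') as handle:
        wikititle_by_id = pickle.load(handle)
    print(len(wikititle_by_id), 'titles loaded')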