def __init__(self, *args, **kwargs):
        """Initialise the job and build the entity representations.

        All positional and keyword arguments are forwarded unchanged to
        ``MRJob.__init__``.  The entity list is then downloaded as JSON
        from a fixed S3 URL and passed to
        ``toy_kba_algorithm.prepare_entities``; the result is stored on
        ``self.entity_representations``.
        """
        MRJob.__init__(self, *args, **kwargs)

        ## load entities from json file
        log("loading entity list")
        response = urllib.urlopen("https://s3.amazonaws.com/trec-kba-2012/entity-urlnames.json")
        try:
            entities = json.load(response)
        finally:
            ## always release the connection, even when the payload
            ## is not valid JSON
            response.close()
        self.entity_representations = toy_kba_algorithm.prepare_entities(entities)
Example #2
0
## get our filter algorithm
import toy_kba_algorithm

## load entities; the context manager closes the file as soon as the
## JSON has been parsed instead of leaving the handle to the garbage
## collector
with open(args.entities) as entities_file:
    filter_topics = json.load(entities_file)

## set the topic set identifier in filter_run
filter_run["topic_set_id"] = filter_topics["topic_set_id"]

## init our toy algorithm
entities = filter_topics["targets"]
if args.recall_filters:
    with open(args.recall_filters) as recall_filters_file:
        recall_filters = json.load(recall_filters_file)
else:
    ## no recall-filter file given: pass an empty mapping instead
    recall_filters = {}
entity_representations = toy_kba_algorithm.prepare_entities(entities, recall_filters)
logger.info( json.dumps(entity_representations, indent=4, sort_keys=True) )

## set the corpus identifier in filter_run; when the path ends in "/"
## the last component is empty, so fall back to the one before it
corpus_id_parts = args.corpus.split("/")
filter_run["corpus_id"] = corpus_id_parts[-1] or corpus_id_parts[-2]

## prepare to iterate over all hours in corpus in chronological order
if args.date_hour:
    ## for parallel mode, we read a single date_hour dir from this
    ## argument
    date_hour_list = [args.date_hour]
    print_comments = False

else:
    ## os.listdir() returns entries in arbitrary order, so sort them
    ## explicitly; this assumes the date_hour directory names sort
    ## lexicographically into chronological order -- TODO confirm
    date_hour_list = sorted(os.listdir(args.corpus))
    ## NOTE(review): `data` and `target_id` are not defined anywhere
    ## in the visible code -- confirm that this loop really belongs
    ## inside this branch.
    for slot_name, values in data['slots'].iteritems():
        if slot_name.isupper() and args.mode == 'slots':
            ## 'slots' mode: for upper-case slot names, keep the
            ## 'value' field of every entry
            for val in values:
                recall_filters[target_id].append(val['value'])
        elif args.mode == 'simple' and slot_name == 'canonical_name':
            ## 'simple' mode: keep the whole canonical name plus each
            ## of its whitespace-separated tokens
            recall_filters[target_id].append(values)
            recall_filters[target_id] += values.split()

## parenthesised so the line parses on both Python 2 and Python 3;
## with a single argument the printed output is the same
print(recall_filters)

## optional slot-name data; stays an empty dict when no file is given
slot_names = {}
if args.slot_names:
    ## the context manager closes the file once the JSON is parsed
    with open(args.slot_names) as slot_names_file:
        slot_names = json.load(slot_names_file)

entity_representations = toy_kba_algorithm.prepare_entities(
    entities, recall_filters=recall_filters,
    slot_names=slot_names,
)
logger.info( json.dumps(entity_representations, indent=4, sort_keys=True) )

## set the corpus identifier in filter_run; when the path ends in "/"
## the last component is empty, so fall back to the one before it
corpus_id_parts = args.corpus.split("/")
filter_run["corpus_id"] = corpus_id_parts[-1] or corpus_id_parts[-2]

## store some non-required run info of our own design to the
## filter_run dict to store in our submission... not too much, just a
## bit of context for humans.
filter_run["run_info"] = {
    "num_entities": len(entities),
    }

print_comments = False