def all_promoted_patterns():
    """Return a DataFrame of co-occurrence rows for every promoted pattern.

    Gathers the 'promoted_patterns' lists from all ontology documents in
    db_onto, then fetches from db_ap every row whose ctx_pattern is among
    them and whose counter exceeds 1 (filters out one-off co-occurrences).

    Returns:
        pd.DataFrame: rows from db_ap matching any promoted pattern;
        empty DataFrame when nothing matches.
    """
    patterns_query = db_onto.find(
        projection={'promoted_patterns': True, '_id': False})
    # each projected doc holds one 'promoted_patterns' value (possibly a
    # nested list per iteration) — flatten them all into one flat list
    patterns = list(flatten([list(d.values()) for d in patterns_query]))
    df_patterns = pd.DataFrame(
        list(db_ap.find({
            "ctx_pattern": {"$in": patterns},
            "counter": {"$gt": 1},
        })))
    return df_patterns
def all_promoted_instances():
    """Return a DataFrame of co-occurrence rows for every promoted instance.

    Gathers the 'promoted_instances' lists from all ontology documents in
    db_onto, then fetches from db_ap every row whose noun_phrase is among
    them and whose counter exceeds 1 (filters out one-off co-occurrences).

    Returns:
        pd.DataFrame: rows from db_ap matching any promoted instance;
        empty DataFrame when nothing matches.
    """
    instances_query = db_onto.find(
        projection={'promoted_instances': True, '_id': False})
    # each projected doc holds one 'promoted_instances' value (possibly a
    # nested list per iteration) — flatten them all into one flat list
    instances = list(flatten([list(d.values()) for d in instances_query]))
    df_instances = pd.DataFrame(
        list(db_ap.find({
            "noun_phrase": {"$in": instances},
            "counter": {"$gt": 1},
        })))
    return df_instances
def promote_instances(category, iteration, max_promotions, limit, T, df_all_promoted_patterns):
    """Promote new instances for *category* at the given iteration.

    Candidates are noun phrases that co-occur (counter > 1) with the
    patterns promoted in the previous iteration, excluding instances
    already promoted.  A candidate survives when its positive evidence
    dominates the negative evidence from other categories' patterns
    (count_pos >= count_neg * T) and count_pos >= 2.  The top
    *max_promotions* survivors are written back to the ontology under
    'promoted_instances.<iteration>' and returned.

    Returns:
        list: the newly promoted noun phrases (empty when no candidates).
    """
    recent_patterns = category['promoted_patterns'][iteration - 1]
    known_instances = list(flatten(category['promoted_instances']))
    known_patterns = list(flatten(category['promoted_patterns']))

    # patterns promoted by this category's mutex-exception categories:
    # their evidence must not count against our candidates
    mutex_docs = db_onto.find(
        {'category_name': {"$in": category['mutex_exceptions']}})
    mutex_patterns = list(
        flatten([doc['promoted_patterns'] for doc in mutex_docs]))

    # extraction step: noun phrases co-occurring with last iteration's
    # promoted patterns, skipping already-promoted instances
    candidates = list(
        db_ap.find({
            "ctx_pattern": {"$in": recent_patterns},
            "noun_phrase": {"$nin": known_instances},
            "counter": {"$gt": 1},
        }))

    if not candidates:
        return []

    # positive evidence: total co-occurrence count per candidate,
    # capped at the `limit` strongest candidates
    count_pos = (pd.DataFrame(candidates)
                 .groupby('noun_phrase')['counter']
                 .sum()
                 .sort_values(ascending=False)
                 .head(limit)
                 .rename('count_pos'))

    # negative evidence: co-occurrence with patterns promoted by
    # *other* categories (own + mutex-exception patterns excluded)
    foreign = df_all_promoted_patterns[
        ~df_all_promoted_patterns['ctx_pattern']
        .isin(known_patterns + mutex_patterns)]
    count_neg = (foreign.groupby('noun_phrase')['counter']
                 .sum()
                 .rename('count_neg'))

    evidence = pd.concat([count_pos, count_neg], axis=1, sort=False).fillna(0)
    # filter criterion #1: positives dominate negatives by factor T
    # filter criterion #2: at least two positive co-occurrences
    keep = ((evidence['count_pos'] >= evidence['count_neg'] * T)
            & (evidence['count_pos'] >= 2))

    # rank by positive evidence and promote the strongest few
    new_instances = list(evidence[keep]
                         .sort_values(by='count_pos', ascending=False)
                         .head(max_promotions)
                         .index.values)

    # persist this iteration's promotions in the ontology
    db_onto.update_one(
        {'category_name': category['category_name']},
        {'$set': {'promoted_instances.' + str(iteration): new_instances}})
    return new_instances
def main():
    """Run the CPL bootstrapping loop, alternating promotion of instances
    and patterns for every category over ``cpl_conf.num_iter`` iterations.

    Iteration 0 promotes the hand-labelled seeds; each later iteration
    promotes instances from last iteration's patterns and patterns from
    last iteration's instances, logging a CSV line per promotion step.
    """
    num_iter = cpl_conf.num_iter  # number of iterations
    max_p = cpl_conf.max_p_promotions  # max pattern promotions per iteration
    max_i = cpl_conf.max_i_promotions  # max instance promotions per iteration
    limit = cpl_conf.limit  # max number of positive candidates for promotion
    T = cpl_conf.T  # multiplier of promotion threshold

    # load category metadata
    categories_init = db_onto.find(
        projection=['category_name', 'seed_instances', 'seed_ctx_pattern'])

    for i in range(num_iter):  # for i iterations
        df_all_promoted_instances = pr.all_promoted_instances()
        df_all_promoted_patterns = pr.all_promoted_patterns()
        # the cursor is exhausted by the previous pass; rewind to reuse it
        categories_init.rewind()

        if i == 0:  # first iteration: promote the seeds
            pr.promote_seeds(categories_init)
        else:
            for c_init in categories_init:  # for all categories
                # load full category information
                category = db_onto.find_one(
                    {'category_name': c_init['category_name']})

                # if there are positive patterns for this iteration
                if i <= len(category['promoted_patterns']):
                    start = time.time()
                    pi = pr.promote_instances(category, i, max_i, limit, T,
                                              df_all_promoted_patterns)
                    print('instance', i, c_init['category_name'], len(pi),
                          time.time() - start, sep=',')

                # if there are positive instances for this iteration
                if i <= len(category['promoted_instances']):
                    start = time.time()
                    pp = pr.promote_patterns(category, i, max_p, limit, T,
                                             df_all_promoted_instances)
                    print('pattern', i, c_init['category_name'], len(pp),
                          time.time() - start, sep=',')

    db.close()