Ejemplo n.º 1
0
def run():
    """Print how many entities in the labeled dataset were annotated and resolved.

    Each row of 'labeled_data/entities.csv' is keyed by columns 0 and 3
    joined with an underscore (i.e., "surfaceform_shorttext"); column 2 is
    the row's label. A key counts as annotated once any of its rows is
    labeled 'Y' or 'N', and additionally as resolved once any of its rows
    is labeled 'Y'. Rows with any other label are ignored.
    """
    records = csv_util.query_csv_for_rows('labeled_data/entities.csv')

    annotated = set()
    resolved = set()
    for record in records:
        key = record[0] + '_' + record[3]
        verdict = record[2]

        if verdict == 'Y':
            resolved.add(key)
        if verdict in ('Y', 'N'):
            annotated.add(key)

    # Parenthesized single-expression print: same output as the print statement.
    print(str(len(annotated)) + " annotated entities.")
    print(str(len(resolved)) + " unanimously annotated entities.")
Ejemplo n.º 2
0
def run():
    """Report the number of annotated and of resolved entities.

    Reads 'labeled_data/entities.csv'. An entity is identified by
    "surfaceform_shorttext" (columns 0 and 3 joined by '_') and its label is
    taken from column 2: 'Y' marks the entity as both annotated and
    resolved, 'N' marks it as annotated only, anything else is skipped.
    """
    seen_labeled = set()
    seen_resolved = set()

    for fields in csv_util.query_csv_for_rows('labeled_data/entities.csv'):
        ident = fields[0] + '_' + fields[3]
        tag = fields[2]

        if tag != 'Y' and tag != 'N':
            continue

        seen_labeled.add(ident)
        if tag == 'Y':
            seen_resolved.add(ident)

    # A parenthesized single expression prints exactly like the print statement.
    print(str(len(seen_labeled)) + " annotated entities.")
    print(str(len(seen_resolved)) + " unanimously annotated entities.")
Ejemplo n.º 3
0
def get_resolved_ambiguous_entities():
    """Return the ambiguous entities whose intended meaning has been resolved.

    Reads "labeled_data/entities.csv". The first row is treated as the header
    and is used to locate the "surface_form", "short_text",
    "candidate_meaning", "candidate_is_relevant" and "user_key" columns by
    name. The remaining rows are grouped per entity, identified by
    "surfaceform_shorttext".

    An entity is kept when it
      * has more than one distinct candidate meaning (it is ambiguous),
      * has at least one candidate labeled "Y" (it has been resolved), and
      * has a known user key.

    Returns:
        dict mapping each ``ResolvedEntity.get_id()`` to its
        ``ResolvedEntity``. Empty when no entity qualifies.

    Raises:
        ValueError: if the header row lacks one of the expected column names.
    """
    correct_meaning_label = "Y"

    # entity id -> list of (meaning, label, surfaceform, shorttext, userkey)
    all_entities = defaultdict(list)

    labeled_entities_dataset = csv_util.query_csv_for_rows("labeled_data/entities.csv", False)
    for row_count, candidate_row in enumerate(labeled_entities_dataset):
        if row_count == 0:
            # header row: resolve the column positions by name
            surfaceform_col = candidate_row.index("surface_form")
            shorttext_col = candidate_row.index("short_text")

            candidate_meaning_col = candidate_row.index("candidate_meaning")
            candidate_label_col = candidate_row.index("candidate_is_relevant")

            userkey_col = candidate_row.index("user_key")
            continue

        # use "surfaceform_shorttext" as ID for entity
        surfaceform = candidate_row[surfaceform_col]
        shorttext = candidate_row[shorttext_col]
        entity_id = surfaceform + "_" + shorttext

        meaning = candidate_row[candidate_meaning_col]
        label = candidate_row[candidate_label_col]
        userkey = candidate_row[userkey_col]
        all_entities[entity_id].append((meaning, label, surfaceform, shorttext, userkey))

    # test if entity is ambiguous (i.e., has more than one candidate meaning) and
    # if so if entity has been resolved (i.e., has at least one candidate labeled
    # as the intended meaning)
    resolved_entities = {}
    for entity_tuple_list in all_entities.values():
        if len(entity_tuple_list) < 2:
            # a single row cannot offer more than one candidate meaning
            continue

        candidate_meanings = []
        intended_meanings = []
        user = None
        # surfaceform / shorttext keep the values of the last row after the loop
        for (meaning, label, surfaceform, shorttext, userkey) in entity_tuple_list:

            # title of a potential meaning of the ambiguous entity
            if meaning not in candidate_meanings:
                candidate_meanings.append(meaning)

            # annotated label indicating whether this candidate
            # meaning is the intended meaning of the entity
            if label == correct_meaning_label and meaning not in intended_meanings:
                intended_meanings.append(meaning)

            # the first user key seen for the entity wins
            if user is None:
                user = userkey

        # FIX: ambiguity is a property of the candidate meanings. The previous
        # check tested len(intended_meanings) twice (> 1 and > 0), which
        # silently dropped every entity with exactly one intended meaning.
        is_ambiguous = len(candidate_meanings) > 1
        is_resolved = len(intended_meanings) > 0
        if is_ambiguous and is_resolved and user is not None:
            # this entity is ambiguous, has been manually resolved,
            # and we know the user who wrote it
            entity_obj = ResolvedEntity(candidate_meanings, intended_meanings, surfaceform, shorttext, user)
            resolved_entities[entity_obj.get_id()] = entity_obj
    return resolved_entities
Ejemplo n.º 4
0
def get_bridged_usernames():
    """Return a dict mapping each user key to its username.

    Reads "labeled_data/user_privacy/anonymized_userhash.csv"; every row must
    unpack into exactly two fields, (userkey, username). When a user key
    occurs more than once, the last row wins.

    Returns:
        dict of userkey -> username.

    Raises:
        ValueError: if a row does not contain exactly two fields.
    """
    usernames = {}
    userhashes = csv_util.query_csv_for_rows("labeled_data/user_privacy/anonymized_userhash.csv")
    for (userkey, username) in userhashes:
        usernames[userkey] = username
    # FIX: the mapping was built and then discarded (implicit None return).
    return usernames
Ejemplo n.º 5
0
def get_resolved_ambiguous_entities():
    """Return the ambiguous entities whose intended meaning has been resolved.

    Reads 'labeled_data/entities.csv'. The first row is treated as the header
    and is used to locate the 'surface_form', 'short_text',
    'candidate_meaning', 'candidate_is_relevant' and 'user_key' columns by
    name. The remaining rows are grouped per entity, identified by
    "surfaceform_shorttext".

    An entity is kept when it
      * has more than one distinct candidate meaning (it is ambiguous),
      * has at least one candidate labeled 'Y' (it has been resolved), and
      * has a known user key.

    Returns:
        dict mapping each ``ResolvedEntity.get_id()`` to its
        ``ResolvedEntity``. Empty when no entity qualifies.

    Raises:
        ValueError: if the header row lacks one of the expected column names.
    """
    correct_meaning_label = 'Y'

    # entity id -> list of (meaning, label, surfaceform, shorttext, userkey)
    all_entities = defaultdict(list)

    labeled_entities_dataset = csv_util.query_csv_for_rows('labeled_data/entities.csv', False)
    for row_count, candidate_row in enumerate(labeled_entities_dataset):
        if row_count == 0:
            # header row: resolve the column positions by name
            surfaceform_col = candidate_row.index('surface_form')
            shorttext_col = candidate_row.index('short_text')

            candidate_meaning_col = candidate_row.index('candidate_meaning')
            candidate_label_col = candidate_row.index('candidate_is_relevant')

            userkey_col = candidate_row.index('user_key')
            continue

        # use "surfaceform_shorttext" as ID for entity
        surfaceform = candidate_row[surfaceform_col]
        shorttext = candidate_row[shorttext_col]
        entity_id = surfaceform + '_' + shorttext

        meaning = candidate_row[candidate_meaning_col]
        label = candidate_row[candidate_label_col]
        userkey = candidate_row[userkey_col]
        all_entities[entity_id].append((meaning, label, surfaceform, shorttext, userkey))

    # test if entity is ambiguous (i.e., has more than one candidate meaning) and
    # if so if entity has been resolved (i.e., has at least one candidate labeled
    # as the intended meaning)
    resolved_entities = {}
    for entity_tuple_list in all_entities.values():
        if len(entity_tuple_list) < 2:
            # a single row cannot offer more than one candidate meaning
            continue

        candidate_meanings = []
        intended_meanings = []
        user = None
        # surfaceform / shorttext keep the values of the last row after the loop
        for (meaning, label, surfaceform, shorttext, userkey) in entity_tuple_list:

            # title of a potential meaning of the ambiguous entity
            if meaning not in candidate_meanings:
                candidate_meanings.append(meaning)

            # annotated label indicating whether this candidate
            # meaning is the intended meaning of the entity
            if label == correct_meaning_label and meaning not in intended_meanings:
                intended_meanings.append(meaning)

            # the first user key seen for the entity wins
            if user is None:
                user = userkey

        # FIX: ambiguity is a property of the candidate meanings. The previous
        # check tested len(intended_meanings) twice (>1 and >0), which
        # silently dropped every entity with exactly one intended meaning.
        is_ambiguous = len(candidate_meanings) > 1
        is_resolved = len(intended_meanings) > 0
        if is_ambiguous and is_resolved and user is not None:
            # this entity is ambiguous, has been manually resolved,
            # and we know the user who wrote it
            entity_obj = ResolvedEntity(candidate_meanings, intended_meanings, surfaceform, shorttext, user)
            resolved_entities[entity_obj.get_id()] = entity_obj
    return resolved_entities
Ejemplo n.º 6
0
def get_bridged_usernames():
    """Return a dict mapping each user key to its username.

    Reads 'labeled_data/user_privacy/anonymized_userhash.csv'; every row must
    unpack into exactly two fields, (userkey, username). When a user key
    occurs more than once, the last row wins.

    Returns:
        dict of userkey -> username.

    Raises:
        ValueError: if a row does not contain exactly two fields.
    """
    usernames = {}
    userhashes = csv_util.query_csv_for_rows('labeled_data/user_privacy/anonymized_userhash.csv')
    for (userkey, username) in userhashes:
        usernames[userkey] = username
    # FIX: the mapping was built and then discarded (implicit None return).
    return usernames
Ejemplo n.º 7
0
def run():
    """Print the "Table 3" account-reuse and bridging statistics.

    Reads 'labeled_data/user_identity.csv' and reports, for Twitter, YouTube
    and Flickr:
      * the initial sample: rows whose label column for that site is
        non-empty (Flickr: column 1, Twitter: column 2, YouTube: column 3);
      * the reused count: rows where both the site's existence flag
        (Flickr: column 5, Twitter: column 6, YouTube: column 7) and the
        Wikipedia flag (column 4) equal 'TRUE', plus that count as a
        percentage of the initial sample;
      * the bridged count: rows labeled 'TP', plus TP as a percentage of
        TP + FP.

    A percentage whose denominator is zero is printed as "n/a" instead of
    raising ZeroDivisionError part-way through the report.
    """
    def percentage(part, whole):
        # Guard the division: an empty sample has no meaningful percentage.
        if whole == 0:
            return "n/a"
        return str(100 * float(part) / whole)

    # (name in the sample line, name in the bridged line, label column,
    #  existence-flag column), listed in the order they are reported.
    # The two display names differ in capitalization for YouTube on purpose:
    # they reproduce the report's existing wording.
    sites = (
        ("Twitter", "Twitter", 2, 6),
        ("YouTube", "Youtube", 3, 7),
        ("Flickr", "Flickr", 1, 5),
    )

    stats = {}
    for sample_name, _, _, _ in sites:
        stats[sample_name] = {"total": 0, "reused": 0, "TP": 0, "FP": 0}

    username_rows = csv_util.query_csv_for_rows('labeled_data/user_identity.csv')
    for row in username_rows:
        on_wikipedia = row[4] == 'TRUE'

        for sample_name, _, label_col, exists_col in sites:
            counts = stats[sample_name]
            label = row[label_col]

            if row[exists_col] == 'TRUE' and on_wikipedia:
                counts["reused"] += 1
            if label != '':
                counts["total"] += 1
            if label in ("TP", "FP"):
                counts[label] += 1

    print("Table 3")
    for sample_name, _, _, _ in sites:
        counts = stats[sample_name]
        print("Initial Sample " + sample_name + ": " + str(counts["total"]))
        print("Reused: " + str(counts["reused"]) + " "
              + percentage(counts["reused"], counts["total"]))
        print(" ")

    for sample_name, bridged_name, _, _ in sites:
        counts = stats[sample_name]
        print(bridged_name + " Bridged: " + str(counts["TP"]) + " "
              + percentage(counts["TP"], counts["TP"] + counts["FP"]))