Ejemplo n.º 1
0
def main():
    """Dump every verb found in the password database to a text file.

    Iterates over all passwords in the database; each password is a list of
    word fragments. Every fragment whose POS tag maps to the WordNet tag
    'v' is written, one per line, to '../results/pos/verbs/all.txt'.

    Returns:
        0, always.
    """
    db = PwdDb()
    tg = TagsetConverter()  # assumes the pwds are pos-tagged using Brown tags

    # The context manager guarantees the file is flushed and closed, even if
    # reading from the database raises midway through.
    with open('../results/pos/verbs/all.txt', 'wb') as output_file:
        while db.hasNext():
            words = db.nextPwd()  # list of Words

            for w in words:
                # Untagged fragments cannot be classified; skip them.
                if w.pos is None:
                    continue
                if tg.brownToWordNet(w.pos) == 'v':
                    output_file.write(str(w.word) + '\n')

    db.finish()

    return 0
Ejemplo n.º 2
0
def main():
    """Print POS-tag statistics for the password database.

    Walks every password in the database and counts:
      * all fragments, and the fragments that carry a POS tag;
      * the frequency of each Brown tag;
      * the frequency of each WordNet tag the Brown tags map to;
      * the verb and noun fragments whose `synsets` attribute is not None
        (reported as "found in WordNet").
    A summary is then printed to stdout.

    Returns:
        0, always.
    """
    db = PwdDb()
    tc = TagsetConverter()  # assumes the pwds are pos-tagged using Brown tags

    def percent(part, whole):
        # Guard against an empty database or a tag class that never occurred;
        # without it the report would die with ZeroDivisionError.
        if not whole:
            return 0.0
        return float(part) * 100 / whole

    pos_dist = dict()     # Brown tag -> number of occurrences
    wn_pos_dist = dict()  # WordNet tag (may be None) -> number of occurrences

    fragments_total = 0  # number of words
    pos_total = 0        # number of pos-tagged words
    wn_verbs_total = 0   # number of verbs that are found in wordnet
    wn_nouns_total = 0   # number of nouns that are found in wordnet

    while db.hasNext():
        words = db.nextPwd()  # list of Words
        fragments_total += len(words)

        for w in words:
            if w.pos is None:
                continue
            pos_total += 1

            pos_dist[w.pos] = pos_dist.get(w.pos, 0) + 1

            wn_pos = tc.brownToWordNet(w.pos)
            wn_pos_dist[wn_pos] = wn_pos_dist.get(wn_pos, 0) + 1

            if w.synsets is not None:
                if wn_pos == 'v':
                    wn_verbs_total += 1
                elif wn_pos == 'n':
                    wn_nouns_total += 1

    db.finish()

    # convert to list of tuples so we can sort it by value (most frequent first)
    pos_dist = sorted(pos_dist.items(), key=lambda entry: entry[1], reverse=True)

    # Each print takes a single pre-formatted string, so the output is the
    # same whether `print` is the Python 2 statement or the function.
    print("Total number of fragments: {}".format(fragments_total))
    print('of which {} are POS tagged words ({}%)'.format(
        pos_total, percent(pos_total, fragments_total)))
    print('\nPOS distribution (Brown tagset):\n')
    for k, v in pos_dist:
        print("{}\t{}".format(k, v))
    print('\nPOS distribution (WordNet tagset):\n{}'.format(wn_pos_dist))
    # .get(..., 0): the 'v'/'n' keys are absent when no such tag was seen.
    print('\n{} verbs found in WordNet ({}% of verbs)'.format(
        wn_verbs_total, percent(wn_verbs_total, wn_pos_dist.get('v', 0))))
    print('\n{} nouns found in WordNet ({}% of nouns)'.format(
        wn_nouns_total, percent(wn_nouns_total, wn_pos_dist.get('n', 0))))

    return 0
Ejemplo n.º 3
0
in WordNet. In addition, I wanted to see the impact on the POS tagging of the
passwords and review my Brown -> WordNet mapping.

Created on 2013-03-12

@author: rafa
'''

from database import PwdDb
from tagset_conversion import TagsetConverter

if __name__ == '__main__':
    # Tally how often each POS tag occurs for which the tagset converter
    # has no WordNet equivalent, then print the tally.
    db = PwdDb()
    tagconverter = TagsetConverter()

    # tags not covered by wordnet: tag -> number of occurrences
    notcovered = {}

    while db.hasNext():
        for fragment in db.nextPwd():
            # Untagged fragments are irrelevant here.
            if fragment.pos is None:
                continue
            # Tags that do map to a WordNet tag are covered; skip them.
            if tagconverter.brownToWordNet(fragment.pos) is not None:
                continue
            notcovered[fragment.pos] = notcovered.get(fragment.pos, 0) + 1

    db.finish()

    # Single-argument form prints the same under the Python 2 print statement.
    print(notcovered)