Example #1
from database import PwdDb
from tagset_conversion import TagsetConverter


def main():
    db = PwdDb()
    tg = TagsetConverter()  # assumes the pwds are POS-tagged using Brown tags

    output_file = open('../results/pos/verbs/all.txt', 'wb')

    while db.hasNext():
        words = db.nextPwd()  # list of Words
        
        for w in words:
            if w.pos is None:
                continue
            # map the Brown tag to a WordNet POS; keep only verbs
            wn_pos = tg.brownToWordNet(w.pos)
            if wn_pos == 'v':
                output_file.write(str(w.word) + '\n')
    
    db.finish()
    
    return 0
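
# The filter above trusts the Brown tag alone. To double-check that an
# extracted word actually exists in WordNet as a verb, one could use
# NLTK (a sketch; it assumes nltk and its WordNet corpus are installed,
# and is not part of the original script):
from nltk.corpus import wordnet as wn

def is_wordnet_verb(word):
    # a non-empty synset list for pos='v' means WordNet knows the verb
    return len(wn.synsets(word, pos='v')) > 0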
Example #2
from database import PwdDb
from tagset_conversion import TagsetConverter


def main():
    db = PwdDb()
    tc = TagsetConverter()  # assumes the pwds are POS-tagged using Brown tags

    pos_dist    = dict()  # frequency of each Brown POS tag
    wn_pos_dist = dict()  # frequency of each WordNet POS tag

    fragments_total = 0  # number of words
    pos_total       = 0  # number of POS-tagged words
    wn_verbs_total  = 0  # number of verbs that are found in WordNet
    wn_nouns_total  = 0  # number of nouns that are found in WordNet
        
    while db.hasNext():
        words = db.nextPwd()  # list of Words
        fragments_total += len(words)

        for w in words:
            if w.pos is None:
                continue
            pos_total += 1
            
            pos_dist[w.pos] = pos_dist.get(w.pos, 0) + 1
            
            wn_pos = tc.brownToWordNet(w.pos)
            
            wn_pos_dist[wn_pos] = wn_pos_dist.get(wn_pos, 0) + 1
                
            if w.synsets is not None:
                if wn_pos == 'v':
                    wn_verbs_total += 1
                elif wn_pos == 'n':
                    wn_nouns_total += 1
        
    db.finish()
    
    # sort the (tag, frequency) pairs by frequency, most frequent first
    pos_dist = sorted(pos_dist.items(), key=lambda entry: entry[1], reverse=True)
    
    print "Total number of fragments: {}".format(fragments_total)
    print 'of which {} are POS tagged words ({}%)'.format(pos_total, float(pos_total)*100/fragments_total)
    print '\nPOS distribution (Brown tagset):\n'
    for k, v in pos_dist:
        print "{}\t{}".format(k, v)
    print '\nPOS distribution (WordNet tagset):\n', wn_pos_dist     
    print '\n{} verbs found in WordNet ({}% of verbs)'.format(wn_verbs_total, float(wn_verbs_total)*100/wn_pos_dist['v'])
    print '\n{} nouns found in WordNet ({}% of nouns)'.format(wn_nouns_total, float(wn_nouns_total)*100/wn_pos_dist['n'])
    
    return 0
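
# The manual dict tallying above can be expressed more compactly with
# collections.Counter from the standard library. A minimal sketch of the
# same idea (the tag list here is made up for illustration):
from collections import Counter

brown_tags = ['nn', 'vb', 'nn', 'jj']  # hypothetical Brown tags
pos_dist = Counter(brown_tags)         # Counter({'nn': 2, 'vb': 1, 'jj': 1})

# most_common() replaces the manual items()/sorted() step
for tag, freq in pos_dist.most_common():
    print "{}\t{}".format(tag, freq)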
Example #3
import csv
from time import time

from database import PwdDb
from tagset_conversion import TagsetConverter
from semantics import SemanticTagger  # import path assumed; not shown in the source


def main():
    """ Tags the passwords by semantic categories, assuming they are
        already POS- and sentiment-tagged. The sentiment info itself is
        not needed, but the synset number obtained during the sentiment
        tagging step is reused to reduce overhead. """

    db = PwdDb()
    tagger = SemanticTagger()
    tg = TagsetConverter()  # assumes the pwds are POS-tagged using Brown tags

    print "tagging process initialized..."
    start = time()

    csv_writer = csv.writer(open("../results/semantic/test.csv", "wb"), dialect='excel')
    
    # processes only the first 100,000 passwords; switch back to
    # "while db.hasNext():" to cover the whole database
    for i in range(1, 100001):
        words = db.nextPwd() # list of Words
        for w in words:
            # if there's a synset for this word, tag using the WordNet POS;
            # otherwise fall back to the Brown tag
            if w.synsets is not None:
                wn_pos = tg.brownToWordNet(w.pos)
                t = tagger.tag(w.word, wn_pos, w.synsets)
            else:
                t = tagger.tag(w.word, w.pos)
        
            w.category = t
            db.saveCategory(w)
            csv_writer.writerow([i, w.word, w.category, w.senti, w.pos])

    db.finish() 
    
    print "tagging process took " + str(time()-start) + " seconds."      
    return 0
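
# SemanticTagger itself is not part of these examples. A hypothetical
# stub illustrating the two call signatures used above (the class body
# and the category values are made up, not the project's real code):
class SemanticTagger(object):

    def tag(self, word, pos, synsets=None):
        # with synsets, a real implementation could disambiguate the word
        # within WordNet; without them it can only fall back to a coarse
        # category derived from the POS tag alone
        if synsets is not None:
            return 'wordnet-category'
        return 'pos-category'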
Example #4
'''
I wrote this just to verify which tags in the Brown corpus have no
counterpart in WordNet. In addition, I wanted to see the impact on the
POS tagging of the passwords and to review my Brown -> WordNet mapping.

Created on 2013-03-12

@author: rafa
'''

from database import PwdDb
from tagset_conversion import TagsetConverter

if __name__ == '__main__':
    db = PwdDb()
    tagconverter = TagsetConverter()
    
    # tags not covered by WordNet, mapped to their frequency
    notcovered = dict()
    
    while db.hasNext():
        p = db.nextPwd()

        for w in p:
            if w.pos is not None and tagconverter.brownToWordNet(w.pos) is None:
                notcovered[w.pos] = notcovered.get(w.pos, 0) + 1
    
    db.finish()

    # report the uncovered tags, most frequent first
    for tag, freq in sorted(notcovered.items(), key=lambda e: e[1], reverse=True):
        print "{}\t{}".format(tag, freq)
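
# TagsetConverter.brownToWordNet is not shown in these examples. A
# minimal sketch of what such a mapping could look like, assuming it
# keys on the leading characters of the Brown tag (the prefix table is
# illustrative, not the project's actual mapping):
BROWN_PREFIX_TO_WORDNET = {
    'nn': 'n',  # nouns
    'vb': 'v',  # verbs
    'jj': 'a',  # adjectives
    'rb': 'r',  # adverbs
}

def brown_to_wordnet(brown_tag):
    # returns None for tags with no WordNet counterpart (e.g. determiners)
    return BROWN_PREFIX_TO_WORDNET.get(brown_tag.lower()[:2])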
Example #5
from database import PwdDb
import re

if __name__ == "__main__":
    db = PwdDb()

    histogram = [0] * 10

    alpha_regex = r"[a-zA-Z]"

    while db.hasNext():
        fragments = db.nextPwd()

        # reduce the password to its alphabetic chars
        password = "".join(re.findall(alpha_regex, fragments[0].password))

        # skip passwords that do not contain alphabetic characters
        if not password:
            continue

        nongap = "".join([f.word for f in fragments if not f.is_gap()])

        coverage = float(len(nongap)) / len(password)

        # bucket coverage into deciles: [0.0, 0.1) -> 0, ..., [0.9, 1.0) -> 9;
        # full coverage also lands in the top bucket
        if coverage >= 1:
            histogram[9] += 1
        else:
            histogram[int(coverage * 10)] += 1
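
    # After the loop, one would presumably close the reader and report
    # the deciles. A sketch of such a report (the label format is an
    # assumption, not part of the original script):
    db.finish()

    for i, count in enumerate(histogram):
        lower = i / 10.0
        print "{:.1f}-{:.1f}\t{}".format(lower, lower + 0.1, count)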