Example #1
import datetime

def update_wordcount(word_fd, label_word_fd, handle, label):
    # Tally every word into the overall FreqDist and the per-label one.
    # FreqDist.inc() is the NLTK 2.x API; words_in_tweet() is a tokenizer
    # defined elsewhere in this module.
    print "Counting '%s'" % label
    print datetime.datetime.now()
    for line in handle:
        for word in words_in_tweet(line):
            word_fd.inc(word)
            label_word_fd[label].inc(word)
    handle.seek(0)  # rewind so the caller can re-read the same file
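A minimal usage sketch, assuming NLTK 2.x (whose FreqDist still provides inc()); the input file name is hypothetical:

import nltk

word_fd = nltk.FreqDist()
label_word_fd = {'pos': nltk.FreqDist(), 'neg': nltk.FreqDist()}
with open('positive_tweets.txt') as handle:  # hypothetical file name
    update_wordcount(word_fd, label_word_fd, handle, 'pos')
print word_fd.N(), 'tokens counted for', word_fd.B(), 'distinct words'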
Example #3
import random

def build_csv(vocab, pos_tweets, negative_tweets, output_csv_file):
    # Turn labeled tweets into bag-of-words count vectors and write four
    # files: .vis (feature rows), .details (raw tweets), .desc (format
    # notes), and .dict (feature-index-to-word mapping).
    nFeature = len(vocab)
    dataset = []

    for (label, tweets) in [(1, pos_tweets), (0, negative_tweets)]:
        for line in tweets:
            features = [0] * nFeature
            for word in words_in_tweet(line):
                if word in vocab:  # a word may be absent from the vocab because of its low overall frequency
                    features[vocab[word]] += 1
            dataset.append((label, line, features))
    random.shuffle(dataset)
    fd = open(output_csv_file + '.vis', 'w')
    fdet = open(output_csv_file + '.details', 'w')
    fdesc = open(output_csv_file + '.desc', 'w')
    fdict = open(output_csv_file + '.dict', 'w')

    fdesc.write('We transformed ' + str(len(dataset)) + ' tweets, ' +
                str(len(pos_tweets)) + ' positives and ' +
                str(len(negative_tweets)) +
                ' negative tweets. Total number of features = ' +
                str(nFeature) + '\n\n')
    fdesc.write('Format of ' + output_csv_file + '.vis file is as follows:\n')
    fdesc.write('PrimaryKey, realLabel, feature1, ..., feature' +
                str(nFeature) + '\n\n')
    fdesc.write('Format of ' + output_csv_file +
                '.details file is as follows:\n')
    fdesc.write('PrimaryKey, realLabel, actual_tweet\n\n')
    fdesc.write('Format of ' + output_csv_file + '.dict file is as follows:\n')
    fdesc.write('NumberOfTheFeature, CorrespondingWord\n\n')
    fdesc.close()

    for word, index in vocab.iteritems():
        fdict.write(str(index) + ':' + word + '\n')
    fdict.close()

    for pk in range(len(dataset)):
        (label, tweet, features) = dataset[pk]
        entries = map(str, features)
        fd.write(','.join([str(pk + 1), str(label)]) + ',' +
                 ','.join(entries) + '\n')
        fdet.write(str(pk + 1) + ',' + str(label) + ',' + tweet)
    fd.close()
    fdet.close()
    print len(dataset), ' total lines were transformed into ', nFeature, ' features'
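For reference, a short sketch of reading the .vis file back; the output prefix is hypothetical, and the column layout follows the .desc notes the function writes (primary key, label, then the count features):

rows = []
for line in open('tweets_dataset.vis'):  # hypothetical output prefix
    cols = line.rstrip('\n').split(',')
    pk, label, counts = int(cols[0]), int(cols[1]), [int(c) for c in cols[2:]]
    rows.append((pk, label, counts))
print len(rows), 'rows loaded'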
Example #6
def build_vocab(min_word_freq, positive_file, negative_file, nPos, nNeg):
    # Read up to nPos positive and nNeg negative tweets, count every word's
    # overall frequency, then assign a feature id to each word that occurs
    # at least min_word_freq times.
    positive_handle = open(positive_file, 'r')
    negative_handle = open(negative_file, 'r')
    groups = [(1, positive_handle, nPos), (0, negative_handle, nNeg)]

    vocab = {}
    for (curLabel, handle, limit) in groups:
        tweetsRead = 0
        tweets = []
        for line in handle:
            if tweetsRead >= limit:
                break
            else:
                tweetsRead = tweetsRead + 1
                tweets.append(line)
                for word in words_in_tweet(line):
                    vocab[word] = vocab.get(word, 0) + 1
        if tweetsRead != limit:
            print '***Warning: you requested ', limit, ' instances for label ', curLabel, ' but we only found ', tweetsRead, ' tweets with that label'
        handle.close()
        if curLabel == 0:
            negative_tweets = tweets
        elif curLabel == 1:
            positive_tweets = tweets

    wordId = {}
    nextId = 0
    for word in vocab.keys():
        if vocab[word] < min_word_freq:
            continue
        wordId[word] = nextId
        nextId = nextId + 1
    return (wordId, positive_tweets, negative_tweets)
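Putting the two together, a hypothetical end-to-end run (file names and the frequency cutoff are made up for illustration; build_vocab's return values line up with build_csv's first three parameters):

vocab, pos_tweets, neg_tweets = build_vocab(
    5, 'positive_tweets.txt', 'negative_tweets.txt', 10000, 10000)
build_csv(vocab, pos_tweets, neg_tweets, 'tweets_dataset')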
Example #7
def features(feat_func, handle, label):
    # Map each tweet in the handle to a (feature_dict, label) pair.
    print "Generating features for '%s'" % label
    print datetime.datetime.now()  # requires: import datetime
    return [(feat_func(words_in_tweet(line)), label) for line in handle]
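A sketch of how this could feed an NLTK Naive Bayes classifier. The body of word_feats is an assumption (the classic presence dict from NLTK sentiment examples); only its name appears in the test script below:

import nltk

def word_feats(words):  # assumed implementation, not from the source
    return dict((word, True) for word in words)

pos_feats = features(word_feats, open('positive_tweets.txt'), 'pos')  # hypothetical files
neg_feats = features(word_feats, open('negative_tweets.txt'), 'neg')
classifier = nltk.NaiveBayesClassifier.train(pos_feats + neg_feats)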
Example #8
import collections
import nltk.metrics

def drange(start, stop, step):
    # Like range(), but for floats: yields start, start + step, ... while <= stop.
    r = start
    while r <= stop:
        yield r
        r += step

testfile = open('testdata.manual.2009.05.25')

print "Loading classifier"
classifier = load_classifier()

print "Running test"
print "prob, pos prec, pos rec, neg prec, neg rec"
for prob in drange(0.5, 1.0, .01):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    testfile.seek(0)
    count = 0
    for line in testfile:
        parts = line.split(";;")  # field 0: gold label; field 5: tweet text
        dist = classifier.prob_classify(word_feats(words_in_tweet(parts[5])))
        if dist.prob(dist.max()) > prob:
            realguess = dist.max()
        else:
            realguess = NEUTRAL
        refsets[parts[0]].add(count)
        testsets[realguess].add(count)
        count += 1
    print "%f, %f, %f, %f, %f" % (prob, nltk.metrics.precision(refsets[POSITIVE], testsets[POSITIVE]), nltk.metrics.recall(refsets[POSITIVE],testsets[POSITIVE]), nltk.metrics.precision(refsets[NEGATIVE], testsets[NEGATIVE]), nltk.metrics.recall(refsets[NEGATIVE], testsets[NEGATIVE]))