Code example #1
import operator

import scan   # project-local module: corpus loader
import utils  # project-local module: provides get_unigram
import dt     # project-local module: decision-tree implementation


def main():
    binary_label = True
    exclude_stopwords = True
    data_nosw, data_positive_nosw, data_negative_nosw = scan.scan(
        'finemedium.txt', exclude_stopwords, binary_label)

    # reshape each (review, label) pair into [word, word, ..., label]
    data = []
    for datum in data_nosw:
        new_datum = datum[0].split()
        new_datum.append(datum[1])
        data.append(new_datum)

    # 500 most frequent positive words; the [1:501] slice skips the
    # top token, a leftover <br> tag
    positive_review_nosw = ' '.join(row[0] for row in data_positive_nosw)
    dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0]
    positive_words = [x[0] for x in sorted(dict_positive_nosw.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True)[1:501]]

    # 500 most frequent negative words, with the same slicing
    negative_review_nosw = ' '.join(row[0] for row in data_negative_nosw)
    dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0]
    negative_words = [x[0] for x in sorted(dict_negative_nosw.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True)[1:501]]

    # merge into one duplicate-free list; copy first so positive_words
    # is not mutated by the extend
    all_words = list(positive_words)
    all_words.extend(x for x in negative_words if x not in positive_words)

    # 80/20 train/test split
    length = len(data)
    train_data = data[:int(length * .8)]
    test_data = data[int(length * .8):]

    decision_tree = dt.train(train_data, all_words)
    test_results = dt.test(decision_tree, test_data)
    print(test_results)
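
The review-classification snippets on this page all call utils.get_unigram(text) and take element [0] of the result, and example #4 below also reads a word count from element [1]. A minimal sketch of the interface they appear to assume, inferred from these call sites rather than taken from the projects themselves (the nagisa example at the end of the page uses a different utils module and iterates over get_unigram's result directly):

from collections import Counter

def get_unigram(text):
    # Hypothetical reconstruction: a word-to-frequency dict plus the
    # total token count, returned as a tuple.
    tokens = text.split()
    return dict(Counter(tokens)), len(tokens)
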
Code example #2
import operator

import scan   # project-local module: corpus loader
import utils  # project-local module: provides get_unigram
import dt     # project-local module: decision-tree implementation


def main():
    binary_label = True
    exclude_stopwords = True
    data_nosw, data_positive_nosw, data_negative_nosw = scan.scan(
        'finefoods.txt', exclude_stopwords, binary_label)

    # reshape each (review, label) pair into [word, word, ..., label]
    data = []
    for datum in data_nosw:
        new_datum = datum[0].split()
        new_datum.append(datum[1])
        data.append(new_datum)

    # 500 most frequent positive words; the [1:501] slice skips the
    # top token, a leftover <br> tag
    positive_review_nosw = ' '.join(row[0] for row in data_positive_nosw)
    dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0]
    positive_words = [x[0] for x in sorted(dict_positive_nosw.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True)[1:501]]

    # 500 most frequent negative words, with the same slicing
    negative_review_nosw = ' '.join(row[0] for row in data_negative_nosw)
    dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0]
    negative_words = [x[0] for x in sorted(dict_negative_nosw.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True)[1:501]]

    # merge into one duplicate-free list; copy first so positive_words
    # is not mutated by the extend
    all_words = list(positive_words)
    all_words.extend(x for x in negative_words if x not in positive_words)

    # 80/20 train/test split
    length = len(data)
    train_data = data[:int(length * .8)]
    test_data = data[int(length * .8):]

    decision_tree = dt.train(train_data, all_words)
    test_results = dt.test(decision_tree, test_data)
    print(test_results)
Code example #3
import operator

import scan   # project-local module: corpus loader
import utils  # project-local module: provides get_unigram
import dt     # project-local module: binary-tree decision tree
import dtd    # project-local module: dictionary-based decision tree


def main():
    # defaults are a binary outcome and no stopwords
    binary_label = True
    exclude_stopwords = True
    data_nosw, data_positive_nosw, data_negative_nosw = scan.scan(
        'finefoods.txt', exclude_stopwords, binary_label)

    # format the data into a two-dimensional array
    data = []
    for datum in data_nosw:
        # the first part of each row is the review, split into words
        new_datum = datum[0].split()
        # the last item in each row is the label
        new_datum.append(datum[1])
        data.append(new_datum)

    # get the 500 most frequent positive words; the [1:501] slice
    # skips the top token, a leftover <br> tag
    positive_review_nosw = ' '.join([row[0] for row in data_positive_nosw])
    dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0]
    positive_words = [
        x[0] for x in sorted(dict_positive_nosw.items(),
                             key=operator.itemgetter(1),
                             reverse=True)[1:501]
    ]

    # get the 500 most frequent negative words, with the same slicing
    negative_review_nosw = ' '.join([row[0] for row in data_negative_nosw])
    dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0]
    negative_words = [
        x[0] for x in sorted(dict_negative_nosw.items(),
                             key=operator.itemgetter(1),
                             reverse=True)[1:501]
    ]

    # build a duplicate-free list of all frequent words from the two
    # lists; copy first so positive_words is not mutated by the extend
    all_words = list(positive_words)
    all_words.extend(x for x in negative_words if x not in positive_words)

    # split into training and testing data (80/20)
    length = len(data)
    train_data = data[:int(length * .8)]
    test_data = data[int(length * .8):]

    # a decision tree backed by dictionaries
    decision_tree_dict = dtd.train(train_data, all_words)
    test_results_dict = dtd.test(decision_tree_dict, test_data)
    print(test_results_dict)

    # the same decision tree backed by a binary tree
    decision_tree = dt.train(train_data, all_words)
    test_results = dt.test(decision_tree, test_data)
    print(test_results)
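
These snippets also share a project-local scan.scan loader. For orientation, here is a data shape consistent with how they index its three return values; the literals are invented for illustration, not taken from finefoods.txt:

# Invented rows; each pairs a review string with a binary label.
data_nosw = [('great coffee arrived fresh', 1),
             ('stale and overpriced', 0)]
data_positive_nosw = [('great coffee arrived fresh', 1)]
data_negative_nosw = [('stale and overpriced', 0)]
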
Code example #4
File: unigrams.py (project: od0/HW2)
# A constructor from a class in unigrams.py; the config constants
# index into the raw datum and into get_unigram's return value.
def __init__(self, datum):
    unigrams = utils.get_unigram(datum[config.REVIEW_INDEX])
    self.unigrams = unigrams[config.UNIGRAMS_INDEX]
    self.word_set = frozenset(self.unigrams.keys())
    self.word_count = unigrams[config.WORD_COUNT_INDEX]
    self.rating = datum[config.SCORE_INDEX]
    self.predicted_rating = None
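
The config module itself is not shown on this page. A hypothetical config.py consistent with how this constructor indexes datum and the get_unigram result (the real constants in od0/HW2 may differ):

# Hypothetical values, inferred from the call sites above.
REVIEW_INDEX = 0      # datum[REVIEW_INDEX] is the raw review text
SCORE_INDEX = 1       # datum[SCORE_INDEX] is the rating/label
UNIGRAMS_INDEX = 0    # get_unigram(...)[UNIGRAMS_INDEX] is the word->count dict
WORD_COUNT_INDEX = 1  # get_unigram(...)[WORD_COUNT_INDEX] is the token total
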
Code example #5
import operator

import scan   # project-local module: corpus loader
import utils  # project-local module: provides get_unigram


def main():
    binary_label = True
    exclude_stopwords = False
    data = scan.scan('fineshort.txt', exclude_stopwords, binary_label)
    length = len(data)
    # join every review and print word frequencies, least frequent first
    all_review = ' '.join(row[0] for row in data)
    dict_all = utils.get_unigram(all_review)[0]
    print(sorted(dict_all.items(), key=operator.itemgetter(1)))
    # 80/20 train/test split
    train_data = data[:int(length * .8)]
    test_data = data[int(length * .8):]
Code example #6
import operator
import os
import sys

import scan   # project-local module: corpus loader
import utils  # project-local module: provides get_unigram


def main():
    # defaults are a binary outcome and excluded stopwords
    binary_label = True
    exclude_stopwords = True

    # a modified scan that returns three lists
    data_nosw, data_positive_nosw, data_negative_nosw = scan.scan(
        'finefoods.txt', exclude_stopwords, binary_label)

    # open output files for writing ("w", since we write text)
    top_all_nosw = open(os.path.join(sys.path[0], "top_all_nosw.txt"), "w")
    top_positive_nosw = open(
        os.path.join(sys.path[0], "top_positive_nosw.txt"), "w")
    top_negative_nosw = open(
        os.path.join(sys.path[0], "top_negative_nosw.txt"), "w")

    # join all reviews for the word count
    all_review_nosw = ' '.join(row[0] for row in data_nosw)
    dict_all_nosw = utils.get_unigram(all_review_nosw)[0]
    # sort by most frequent words and write "word count" lines to file
    top_all_nosw.write('\n'.join('%s %s' % x for x in sorted(
        dict_all_nosw.items(), key=operator.itemgetter(1), reverse=True)))

    # join all positive reviews for the word count
    positive_review_nosw = ' '.join(row[0] for row in data_positive_nosw)
    dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0]
    top_positive_nosw.write('\n'.join('%s %s' % x for x in sorted(
        dict_positive_nosw.items(), key=operator.itemgetter(1),
        reverse=True)))

    # join all negative reviews for the word count
    negative_review_nosw = ' '.join(row[0] for row in data_negative_nosw)
    dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0]
    top_negative_nosw.write('\n'.join('%s %s' % x for x in sorted(
        dict_negative_nosw.items(), key=operator.itemgetter(1),
        reverse=True)))

    # close the files
    top_all_nosw.close()
    top_positive_nosw.close()
    top_negative_nosw.close()

    # the same set of routines, this time keeping stopwords
    data_sw, data_positive_sw, data_negative_sw = scan.scan(
        'finefoods.txt', not exclude_stopwords, binary_label)
    top_all_sw = open(os.path.join(sys.path[0], "top_all_sw.txt"), "w")
    top_positive_sw = open(
        os.path.join(sys.path[0], "top_positive_sw.txt"), "w")
    top_negative_sw = open(
        os.path.join(sys.path[0], "top_negative_sw.txt"), "w")

    all_review_sw = ' '.join(row[0] for row in data_sw)
    dict_all_sw = utils.get_unigram(all_review_sw)[0]
    top_all_sw.write('\n'.join('%s %s' % x for x in sorted(
        dict_all_sw.items(), key=operator.itemgetter(1), reverse=True)))

    positive_review_sw = ' '.join(row[0] for row in data_positive_sw)
    dict_positive_sw = utils.get_unigram(positive_review_sw)[0]
    top_positive_sw.write('\n'.join('%s %s' % x for x in sorted(
        dict_positive_sw.items(), key=operator.itemgetter(1), reverse=True)))

    negative_review_sw = ' '.join(row[0] for row in data_negative_sw)
    dict_negative_sw = utils.get_unigram(negative_review_sw)[0]
    top_negative_sw.write('\n'.join('%s %s' % x for x in sorted(
        dict_negative_sw.items(), key=operator.itemgetter(1), reverse=True)))

    top_all_sw.close()
    top_positive_sw.close()
    top_negative_sw.close()
Code example #7
from collections import defaultdict

import utils  # project-local module: provides get_unigram


def mostPopularPositiveNegative(data, n=500):
    allUnigrams = defaultdict(int)
    positiveUnigrams = defaultdict(int)
    negativeUnigrams = defaultdict(int)
    for review in data:
        for unigram, freq in utils.get_unigram(review[0])[0].items():
            allUnigrams[unigram] += freq
            positiveUnigrams[unigram] += freq if review[1] == 1 else 0
            negativeUnigrams[unigram] += freq if review[1] == 0 else 0

    topAllUnigrams = sorted(allUnigrams, key=allUnigrams.get, reverse=True)
    topPositiveUnigrams = sorted(positiveUnigrams,
                                 key=positiveUnigrams.get, reverse=True)
    topNegativeUnigrams = sorted(negativeUnigrams,
                                 key=negativeUnigrams.get, reverse=True)
    print("Top 30 Popular Unigrams", "Top 30 Positive Unigrams",
          "Top 30 Negative Unigrams")
    # all three dicts share a key set, so one bound covers all columns
    for i in range(min(30, len(topAllUnigrams))):
        print(repr(topAllUnigrams[i]).center(30),
              repr(topPositiveUnigrams[i]).center(30),
              repr(topNegativeUnigrams[i]).center(30))

    # honor the n parameter (previously unused): return the n most
    # frequent unigrams of each polarity
    return topPositiveUnigrams[:n] + topNegativeUnigrams[:n]
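
A minimal usage sketch, assuming each element of data pairs a review string with a binary label (1 positive, 0 negative), as the function's indexing implies:

reviews = [('great value great taste', 1),
           ('stale and overpriced', 0),
           ('great coffee would buy again', 1)]
feature_words = mostPopularPositiveNegative(reviews, n=2)
# feature_words holds the two most frequent unigrams of each polarity
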
Code example #8
import operator
import os
import sys

import scan   # project-local module: corpus loader
import utils  # project-local module: provides get_unigram


def main():
    binary_label = True
    exclude_stopwords = True
    data_nosw, data_positive_nosw, data_negative_nosw = scan.scan(
        'fineshort.txt', exclude_stopwords, binary_label)
    # open output files for writing ("w", since we write text)
    top_all_nosw = open(os.path.join(sys.path[0], "top_all_nosw.txt"), "w")
    top_positive_nosw = open(
        os.path.join(sys.path[0], "top_positive_nosw.txt"), "w")
    top_negative_nosw = open(
        os.path.join(sys.path[0], "top_negative_nosw.txt"), "w")

    # word counts over all reviews, written as "word count" lines,
    # most frequent first
    all_review_nosw = ' '.join(row[0] for row in data_nosw)
    dict_all_nosw = utils.get_unigram(all_review_nosw)[0]
    top_all_nosw.write('\n'.join('%s %s' % x for x in sorted(
        dict_all_nosw.items(), key=operator.itemgetter(1), reverse=True)))

    positive_review_nosw = ' '.join(row[0] for row in data_positive_nosw)
    dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0]
    top_positive_nosw.write('\n'.join('%s %s' % x for x in sorted(
        dict_positive_nosw.items(), key=operator.itemgetter(1),
        reverse=True)))

    negative_review_nosw = ' '.join(row[0] for row in data_negative_nosw)
    dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0]
    top_negative_nosw.write('\n'.join('%s %s' % x for x in sorted(
        dict_negative_nosw.items(), key=operator.itemgetter(1),
        reverse=True)))

    top_all_nosw.close()
    top_positive_nosw.close()
    top_negative_nosw.close()

    # repeat the same routines with stopwords kept
    data_sw, data_positive_sw, data_negative_sw = scan.scan(
        'fineshort.txt', not exclude_stopwords, binary_label)
    top_all_sw = open(os.path.join(sys.path[0], "top_all_sw.txt"), "w")
    top_positive_sw = open(
        os.path.join(sys.path[0], "top_positive_sw.txt"), "w")
    top_negative_sw = open(
        os.path.join(sys.path[0], "top_negative_sw.txt"), "w")

    all_review_sw = ' '.join(row[0] for row in data_sw)
    dict_all_sw = utils.get_unigram(all_review_sw)[0]
    top_all_sw.write('\n'.join('%s %s' % x for x in sorted(
        dict_all_sw.items(), key=operator.itemgetter(1), reverse=True)))

    positive_review_sw = ' '.join(row[0] for row in data_positive_sw)
    dict_positive_sw = utils.get_unigram(positive_review_sw)[0]
    top_positive_sw.write('\n'.join('%s %s' % x for x in sorted(
        dict_positive_sw.items(), key=operator.itemgetter(1), reverse=True)))

    negative_review_sw = ' '.join(row[0] for row in data_negative_sw)
    dict_negative_sw = utils.get_unigram(negative_review_sw)[0]
    top_negative_sw.write('\n'.join('%s %s' % x for x in sorted(
        dict_negative_sw.items(), key=operator.itemgetter(1), reverse=True)))

    top_all_sw.close()
    top_positive_sw.close()
    top_negative_sw.close()

    # 80/20 train/test split over the stopword-free (review, label) rows
    length = len(data_nosw)
    train_data = data_nosw[:int(length * .8)]
    test_data = data_nosw[int(length * .8):]
Code example #9
File: prepro.py (project: liben2018/nagisa)
# OOV and PAD are module-level constants, and update_dict and
# cut_by_threshold are helpers defined elsewhere in prepro.py.
def create_vocabs_from_trainset(trainset,
                                threshold=2,
                                fn_dictionary=None,
                                save_vocabs=True,
                                fn_vocabs=None,
                                oov=OOV,
                                pad=PAD):
    # Create a word-to-POStags dictionary.
    word2postags = {}
    with open(fn_dictionary, 'r') as texts:
        for text in texts:
            text = utils.utf8rstrip(text)
            word, postag = text.split('\t')
            word = utils.normalize(word)
            # lower setting: 1
            word = word.lower()
            if word in word2postags:
                word2postags[word].append(postag)
            else:
                word2postags[word] = [postag]

    # Create a word-to-index dictionary and an index-to-word dictionary.
    dictionary = {oov: 0, pad: 1}
    for word in word2postags.keys():
        dictionary[word] = len(dictionary)
    id2word = {i: w for w, i in dictionary.items()}

    # Create a unigram-to-index dictionary and a bigram-to-index
    # dictionary, and reconstruct the word-to-index dictionary.
    words = []
    uni2id = {}
    bi2id = {}
    word2id = {}
    pos2id = {oov: 0}
    with open(trainset, 'r') as texts:
        for text in texts:
            text = utils.utf8rstrip(text)
            if text == 'EOS':
                sent = ''.join(words)
                unis = utils.get_unigram(sent)
                for uni in unis:
                    uni2id = update_dict(uni, uni2id)

                bis = utils.get_bigram(sent)
                for bi in bis:
                    bi2id = update_dict(bi, bi2id)

                words_at_i = utils.get_words_starting_at_i(sent, dictionary)
                words_at_i += utils.get_words_ending_at_i(sent, dictionary)
                # each element of words_at_i is a list of word ids
                for word_ids in words_at_i:
                    for wid in word_ids:
                        word = id2word[wid]
                        word2id = update_dict(word, word2id)
                words = []

            else:
                word, pos = text.split('\t')
                word = utils.normalize(word)
                word = word.replace(' ', ' ')
                # lower setting: 2
                word = word.lower()
                words.append(word)
                word2id = update_dict(word, word2id)
                pos2id = update_dict(pos, pos2id)

    # Cut keys by the frequency threshold.
    uni2id = cut_by_threshold(uni2id, oov, pad, threshold)
    bi2id = cut_by_threshold(bi2id, oov, pad, threshold)
    word2id = cut_by_threshold(word2id, oov, pad, threshold)

    # Create a POStag-to-index dictionary.
    pos2id = {k: i for i, k in enumerate(pos2id.keys())}
    word2postags = {
        k: [pos2id[p] for p in list(set(v))]
        for k, v in word2postags.items()
    }

    vocabs = [uni2id, bi2id, word2id, pos2id, word2postags]
    if save_vocabs:
        utils.dump_data(vocabs, fn_vocabs)

    return vocabs
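
create_vocabs_from_trainset leans on two helpers defined elsewhere in prepro.py. The sketches below are inferred from the call sites above, not copied from the nagisa source:

def update_dict(key, freq_dict):
    # increment key's frequency count and return the updated dict
    freq_dict[key] = freq_dict.get(key, 0) + 1
    return freq_dict

def cut_by_threshold(freq_dict, oov, pad, threshold):
    # keep keys seen at least `threshold` times and assign them ids,
    # reserving 0 and 1 for the OOV and padding symbols
    vocab = {oov: 0, pad: 1}
    for key, freq in freq_dict.items():
        if freq >= threshold:
            vocab[key] = len(vocab)
    return vocab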