Example #1
def assign():
    # Map each KMeans cluster to a news category by majority vote.
    # Relies on globals set up elsewhere in the script (see Example #4):
    # categories, rd, pd, correct, vectorizer, kmeans, shape, Counter.
    assignment = []
    possible = []   # clusters already claimed by an earlier category
    true_count = 0  # documents whose predicted cluster matches the assigned one
    for i in categories:
        test_text = rd.filereader([i], '../train/')
        test_text = pd.Series([correct(test) for test in test_text])
        val = [kmeans.predict(a)[0] for a in vectorizer.transform(test_text)]
        count = Counter(val)
        print(count.most_common(3))
        if count.most_common(1)[0][0] in possible:
            # Most frequent cluster is already taken: fall back to the runner-up.
            assignment.append((count.most_common(2)[1][0], i))
            true_count += count.most_common(2)[1][1]
            possible.append(count.most_common(2)[1][0])
        else:
            assignment.append((count.most_common(1)[0][0], i))
            true_count += count.most_common(1)[0][1]
            possible.append(count.most_common(1)[0][0])
    accuracy = true_count / shape * 100
    return dict(assignment), accuracy, true_count
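The greedy cluster-to-category voting used above can be seen in isolation with a small self-contained sketch; the category names and cluster ids below are made up purely for illustration and are not taken from the original data.

from collections import Counter

# Toy cluster predictions for two categories; the values are illustrative only.
predictions = {
    'business': [3, 3, 3, 1, 0],
    'sport':    [3, 3, 0, 0, 0],
}

taken = []
mapping = {}
for category, clusters in predictions.items():
    count = Counter(clusters)
    best, runner_up = count.most_common(2)[0][0], count.most_common(2)[1][0]
    # Prefer the most common cluster; fall back to the runner-up if it is taken.
    chosen = runner_up if best in taken else best
    taken.append(chosen)
    mapping[chosen] = category

print(mapping)  # -> {3: 'business', 0: 'sport'}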
Example #2
    # (Tail of a preceding visitor callback; its def line is not part of this excerpt.)
    return True

def tokenset(node, order):
    # On the post-order visit, fill in a node's start/end tokens from its
    # children when the node has no start token of its own.
    if order == tree.tnode.WALK_POST and (not node.start and node.children):
        node.start = node.children[0].start
        node.end   = node.children[-1].end
    return True

import block
import exp

if __name__ == "__main__":
    import sys
    import reader

    # S, parser, lex, proarkhe and parentset come from the surrounding module,
    # which is not part of this excerpt.
    s = S(parser.pushstream(lex.lex(reader.filereader(sys.stdin))))
    if not s.parse():
        print("Cannot parse")
    elif not s.atend():
        print("Garbage after S")
    else:
        print(s.pprint())
        print(s.printexp())

        # Tree passes: set parent links, fill token spans, simplify expressions,
        # then recompute parents and block/declaration information.
        s.block = proarkhe.proarkhe()
        s.visit(parentset)
        s.walk(tokenset)
        s.reform([exp.simplify_depth], [exp.simplify_arith])
        s.visit(parentset)
        s.visit(block.blockset)
        s.visit(block.declsset)
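parentset is used by the driver above but its definition is not part of this excerpt. A purely illustrative sketch of what such a visitor callback could look like, assuming visit callbacks receive the node and that nodes expose children and a writable parent attribute:

# Hypothetical sketch only: the real parentset is defined elsewhere in the
# original module and may differ in signature and behavior.
def parentset(node):
    for child in node.children:
        child.parent = node
    return True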
Example #3
def plot(name):
    # Scatter-plot the predicted cluster id for each document in one category.
    text_test = rd.filereader([name], '../train/')
    text_test = pd.Series([correct(test) for test in text_test])
    x = range(text_test.shape[0])
    y = [kmeans.predict(a)[0] for a in vectorizer.transform(text_test)]
    plt.scatter(x, y)
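A minimal usage sketch, assuming the driver globals from Example #4 are in scope and matplotlib.pyplot is imported as plt:

# Illustrative only; 'sport' is one of the categories listed in Example #4.
plot('sport')
plt.show()  # cluster id per document for the chosen category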
Example #4
            # (This excerpt starts mid-way through assign(); the full function is Example #1.)
            true_count += count.most_common(2)[1][1]
            possible.append(count.most_common(2)[1][0])
        else:
            assignment.append((count.most_common(1)[0][0], i))
            true_count += count.most_common(1)[0][1]
            possible.append(count.most_common(1)[0][0])
    accuracy = true_count / shape * 100
    return dict(assignment), accuracy, true_count

#%%
def predict():
    # Read a raw article, clean it, and map its predicted cluster back to a category.
    with open('./test.txt', 'r') as f:
        text = f.read()
    return assignment[kmeans.predict(vectorizer.transform([correct(text)]))[0]]

#%%
# Build the training corpus: one document per file across the five categories.
# rd (file reader), correct (text clean-up) and lm are project helpers defined
# elsewhere; TfidfVectorizer and KMeans come from scikit-learn.
categories = ['business', 'entertainment', 'politics', 'sport', 'tech']
text = pd.Series(rd.filereader(categories, '../train/'))
shape = text.shape[0]  # total number of training documents
text = [correct(t) for t in text]
#text = lm.wordtokenize(text)
# TF-IDF over unigrams and bigrams, then cluster into one group per category.
vectorizer = TfidfVectorizer(stop_words='english', encoding='iso-8859-1', ngram_range=(1, 2))
text_vector = vectorizer.fit_transform(text)
kmeans = KMeans(n_clusters=5)
kmeans.fit(text_vector)
assignment, accuracy, true_count = assign()
print(assignment)

#%%
# Persist the fitted vectorizer, the cluster-to-category map, and the model.
pickle.dump([vectorizer, assignment, kmeans], open('../pickel/newssort', 'wb'))
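A hedged sketch of how the pickled artifacts might be loaded back for inference. The file path mirrors the dump above; correct() is a project-specific cleaner that is not shown in these examples, so this sketch simply skips that step.

import pickle

# Illustrative loading sketch; assumes the same list layout used by the dump above.
with open('../pickel/newssort', 'rb') as f:
    vectorizer, assignment, kmeans = pickle.load(f)

def classify(raw_text):
    # The original pipeline also applies correct() before vectorizing.
    cluster = kmeans.predict(vectorizer.transform([raw_text]))[0]
    return assignment[cluster]

print(classify("The club confirmed the transfer ahead of the new season."))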