# Example no. 1
# 0
def buildTFIDFDictionary(csvName):
    things = csv_object.getThings(csvName)
    global words
    global descriptions

    for thingy in things:
        description = thingy.description
        descriptions.append(description)
        wordInDoc = description
        words = words.union(wordInDoc)

    # dictionary = [["None" for x in range(len(things)+1)]"None " for x in range(len(words)+1)]] #define matrix of things and words
    # dictionary = {}

    # multiprocessing
    print cpu_count(), len(things)
    thingPool = Pool(cpu_count())
    results = thingPool.map(thingThreadHelper, things)

    # for i, thingy in enumerate(things):
    #     dictionary[i] = thing.title
    #     for j, word in enumerate(words):
    #         dictionary[i][0]= word
    #         dictionary[i][j] = tfidf(word, thingy.description, descriptions)))

    # print type(results)
    # print results

    # for r in results:
    #     print r
    return results
from numpy import array
import csv_object
from random import shuffle
from sklearn.svm import SVC

# features: collected, commented, downloads, likes, remixes, views
# response: real ( determined my makes >= 10)

filename = "FinalItems-Full.csv"
# filename = "small.csv"

# get our starting data
things = csv_object.getThings(filename)
shuffle(things)

c = 0
for thing in things:
    if thing.real:
        c += 1
print c, len(things)

training = things[0:len(things) - len(things) / 10]
evaluation = things[len(things) - len(things) / 10:len(things)]

# create data and target lists from training set
dataT = []
targetT = []
for x in training:
    dataT.append([x.collected, x.commented, x.downloads, x.likes, x.remixed, x.views])
    targetT.append(x.real)