Example #1
import sys
import numpy as np
import datasetUtils as dsu  # assumed name of the utilities module referenced as "dsu" below

if len(sys.argv) < 5:
    # expected command-line arguments:
    # <big5 trait> <embeddings dataset> <dataset path> <shuffle flag>
    sys.exit(1)
else:
    big5 = sys.argv[1].upper()
    dataset = sys.argv[2]
    dataset_path = sys.argv[3]
    shuffle = sys.argv[4]

posts = []
yO = []
yC = []
yE = []
yA = []
yN = []

print("Loading myPersonality...")
[posts, yO, yC, yE, yA, yN] = dsu.readMyPersonality()
print("Loading embeddings dataset...")
if dataset == 'fasttext':
    transform = True
    wordDictionary = dsu.parseFastText(dataset_path)
else:
    transform = False
    wordDictionary = dsu.loadEmbeddingsDataset(dataset_path)
print("Data successfully loaded.")

filename = "tuning_LASSO_" + big5 + "_" + dataset

if shuffle.lower() in ("true", "yes"):
    # shuffle posts and labels with the same permutation so they stay aligned
    s = np.arange(posts.shape[0])
    np.random.shuffle(s)
    posts = posts[s]
    yO, yC, yE, yA, yN = yO[s], yC[s], yE[s], yA[s], yN[s]
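
A minimal sketch (not the repository's dsu.parseFastText) of how a FastText .vec file in the standard text format could be read into the word-to-vector dictionary these scripts expect; the function name is a placeholder.

import numpy as np

def parse_vec_file(path):
    # Sketch only: assumes the standard .vec text format, i.e. a
    # "<vocab_size> <dim>" header line followed by "<word> <v1> ... <vdim>" lines.
    embeddings = {}
    with open(path, encoding="utf-8") as f:
        next(f)  # skip the "<vocab_size> <dim>" header line
        for line in f:
            parts = line.rstrip().split()
            embeddings[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return embeddings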

Example #2

import datasetUtils as dsu  # assumed name of the utilities module, as in Example #1
from sklearn.feature_extraction.text import CountVectorizer


def countWords(embeddings_dataset):
    # count all tokens in the corpus and how many of them appear in the embeddings vocabulary
    tot_words = 0
    found_words = 0
    for document in data:
        words = analyzer(document)
        if len(words) < 1:
            #move to the next document
            continue
        for word in words:
            tot_words += 1
            try:
                word_embedding = embeddings_dataset[word]
                found_words += 1
            except KeyError:
                continue
    return [tot_words, found_words]


print("Loading train data...")
[data, y_O, y_C, y_E, y_A, y_N] = dsu.readMyPersonality()

vectorizer = CountVectorizer(stop_words="english", analyzer="word")
analyzer = vectorizer.build_analyzer()

print("Loading Datasets...")
wordDictionary = dsu.parseFastText("../FastText/dataset.vec")
print("FastText loaded.")

[words, hits] = countWords(wordDictionary)

print("tot_words:", words)
print("found_words:", hits)
print("word coverage: %.2f%%" % float((100 * hits) / words))