def testClassifiers(dataset, out):
    names = ["Nearest Neighbors", "Decision Tree", "Random Forest", "AdaBoost",
             "Naive Bayes", "Linear Discriminant Analysis",
             "Quadratic Discriminant Analysis"]
    # names = ["Linear SVM", "RBF SVM"]
    classifiers = [
        KNeighborsClassifier(10),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        # SVC(kernel='linear', cache_size=1500),
        # SVC(kernel='rbf', cache_size=1500),
    ]
    for clf, name in zip(classifiers, names):
        globe.getLogger().info('Testing Classifier: %s', name)
        out.write(name + '\n')
        scores = cross_validation.cross_val_predict(
            clf, dataset.instances, dataset.labels, cv=10, verbose=0)
        cm = confusion_matrix(dataset.labels, scores)
        out.write('%d\t%d\t%d\t%d\t%.10f\t%.10f\t%.10f\t%.10f\n' % (
            cm[0][0], cm[0][1], cm[1][0], cm[1][1],
            precision_score(dataset.labels, scores),
            recall_score(dataset.labels, scores),
            accuracy_score(dataset.labels, scores),
            f1_score(dataset.labels, scores)))
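# Sketch of how testClassifiers might be driven (an assumption, not code taken
# from this repo): load a previously saved feature Dataset and write one line
# of confusion-matrix counts and scores per classifier to a report file. The
# report filename and the choice of USER_VECTOR_NORM_DATASET are illustrative.
def runClassifierSweep():
    d = loadDataset(USER_VECTOR_NORM_DATASET)
    with open('classifier-report.tsv', 'w') as out:
        testClassifiers(d, out)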
def unigramDF():
    globe.getLogger().info('Counting Unigram')

    def splitter(instance):
        return set(instance.split())

    countDF(splitter, UNIGRAM_DF_OUTPUT)
def user2vec(words, d, normalized=True, discardMissing=True):
    globe.getLogger().info(
        'Converting Users to Vectors (NORMALIZED = %r, DISCARD_MISSING_WORD = %r)'
        % (normalized, discardMissing))
    missingWords = {}
    dim = words[words.keys()[0]].shape[0]
    vectorInstances = []
    for i, instance in enumerate(d.instances):
        if i % 1000 == 0 and i != 0:
            globe.getLogger().info('processed %d instances' % i)
        vector = np.zeros(dim)
        wCount = 0
        for w in instance.split():
            if w in words:
                wCount += 1
                vector += words[w]
            else:
                if not discardMissing:
                    if w not in missingWords:
                        missingWords[w] = np.random.rand(dim)
                    vector += missingWords[w]
        if normalized:
            if wCount != 0:
                vector /= wCount
        vectorInstances.append(vector)
    # remove zero vectors
    vecDataset = Dataset()
    globe.getLogger().info('Removing zero vectors')
    zeroCount = 0
    for i, vector in enumerate(vectorInstances):
        if np.count_nonzero(vector) == 0:
            globe.getLogger().info('User %d is empty' % d.users[i])
            zeroCount += 1
        else:
            vecDataset.users.append(d.users[i])
            vecDataset.labels.append(d.labels[i])
            vecDataset.instances.append(vector)
    globe.getLogger().info('Total Found: %d' % zeroCount)
    if normalized:
        if discardMissing:
            fout = USER_VECTOR_NORM_DISMW_DATASET
        else:
            fout = USER_VECTOR_NORM_DATASET
    else:
        if discardMissing:
            fout = USER_VECTOR_DISMW_DATASET
        else:
            fout = USER_VECTOR_DATASET
    vecDataset.save(fout)
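# A minimal worked example of the averaging rule user2vec applies (word
# vectors and tokens are invented for illustration): the user vector is the
# sum of the known word vectors, divided by the number of known words when
# normalized=True; unknown words are skipped (discardMissing=True) or mapped
# to a fixed per-word random vector otherwise.
def _averageExample():
    words = {'hello': np.array([1.0, 0.0]), 'world': np.array([0.0, 1.0])}
    vector = np.zeros(2)
    known = 0
    for w in 'hello world hello'.split():
        if w in words:
            vector += words[w]
            known += 1
    if known != 0:
        vector /= known
    return vector  # approximately array([0.667, 0.333])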
def loadDFFeatures(featurePath):
    globe.getLogger().info('Loading Features from %s' % featurePath)
    with open(featurePath) as fin:
        features = []  # {}
        for i, l in enumerate(fin.xreadlines()):
            if i % 100000 == 0 and i != 0:
                globe.getLogger().info('loaded %d features' % i)
            gram, df1, df2, score = l.strip().split()
            # features[gram] = [int(df1), int(df2), float(score)]
            features.append(gram)
        return features
def bigramDF():
    globe.getLogger().info('Counting Bigram')

    def splitter(instance):
        bigrams = set()
        for l in instance.split('\n'):
            words = l.split()
            for i in range(len(words) - 1):
                bigrams.add(words[i] + '_' + words[i + 1])
        return bigrams

    countDF(splitter, BIGRAM_DF_OUTPUT)
def testClassifiers(dataset, out):
    names = ['MultinomialNB', 'BernoulliNB']
    classifiers = [MultinomialNB, BernoulliNB]
    for clf, name in zip(classifiers, names):
        globe.getLogger().info('Testing Classifier: %s', name)
        out.write('Testing Classifier: %s\n' % name)
        cv.crossValidate(dataset, clf(), out)
        scores = cross_validation.cross_val_predict(
            clf(), dataset.instances, dataset.labels, cv=10, verbose=0)
        out.write('Total:\n' + str(confusion_matrix(dataset.labels, scores)))
def train(dataset, dim):
    globe.getLogger().info('Training Paragraph Vectors (Dimension = %d)' % dim)
    model = Doc2Vec(size=dim, window=8, workers=8, alpha=0.025,
                    min_alpha=0.025, min_count=2)
    model.build_vocab(dataset.instances)
    for epoch in range(10):
        globe.getLogger().info('Training %d time' % epoch)
        model.train(dataset.instances)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
        model.train(dataset.instances)
    model.save(PARA_MODEL)
    return model
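# Downstream use of the trained model (a note, not code from this repo): the
# per-document vectors learned above are addressed through the 'T_<i>' tags
# assigned in loadParaDataset below, e.g. model.docvecs[u'T_0'], which is what
# test() reads back when it builds the paragraph-vector Dataset.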
def loadParaDataset():
    globe.getLogger().info('Loading Dataset')
    dataset = loadDataset(DOC_DATAEST)
    paraDataset = Dataset()
    paraDataset.users = dataset.users
    paraDataset.labels = dataset.labels
    for i, instance in enumerate(dataset.instances):
        if i % 100 == 0:
            globe.getLogger().info('Processed %d instances' % i)
        paraDataset.instances.append(
            LabeledSentence(words=instance.split(), tags=[u'T_%d' % i]))
    dataset.save(PRE_PARA_DOC)
    return paraDataset
def loadWord2vec(dim):
    globe.getLogger().info('Start Loading word vector file')
    with open(PATH + '/tweets-%d.bin.txt' % dim) as fin:
        words = {}
        count, dim = fin.readline().strip().split()
        dim = int(dim)
        for i, l in enumerate(fin.xreadlines()):
            if i % 100000 == 0 and i != 0:
                globe.getLogger().info('read %d lines' % i)
            vector = l.strip().split()
            words[vector[0]] = np.array([float(v) for v in vector[1:]])
        return words
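# loadWord2vec assumes the plain-text word2vec dump format: a header line
# "<vocab_size> <dim>" followed by one "<word> <floats...>" line per word.
# Self-contained sketch of such a file (the path and values are invented):
def _writeToyVectorFile(path='/tmp/tweets-toy.bin.txt'):
    with open(path, 'w') as fout:
        fout.write('2 3\n')
        fout.write('hello 0.1 0.2 0.3\n')
        fout.write('world -0.1 0.0 0.5\n')
    return path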
def test(model, dim, dataset, out):
    instances = [model.docvecs[u'T_%d' % i]
                 for i in range(len(dataset.instances))]
    d = Dataset()
    zeroCount = 0
    for i, ins in enumerate(instances):
        if np.isfinite(ins).all():
            d.labels.append(dataset.labels[i])
            d.instances.append(ins)
            d.users.append(dataset.users[i])
        else:
            zeroCount += 1
    d.save(PATH + 'paragraph-vector-s%d.obj' % dim)
    globe.getLogger().info('Zero Count: %d' % zeroCount)
def read(self, fname):
    '''
    Read dataset from original dataset file. Format:
        !ID \t IS_SUSPENDED
        TWEET1
        TWEET2
        ...
        !ID \t IS_SUSPENDED
        ...
    '''
    users = []
    u = {}
    with open(fname) as fin:
        globe.getLogger().info('Start reading file: %s', fname)
        for i, l in enumerate(fin.xreadlines()):
            l = l.strip()
            if i != 0 and i % 10000 == 0:
                globe.getLogger().info('Read %d lines', i)
            if l != '':
                if l[0] == '!':
                    if 'id' in u:
                        users.append(u)
                        u = {}
                    uid, isSuspended = l.split()
                    u['id'] = int(uid[1:])
                    u['suspended'] = int(isSuspended)
                    u['tweets'] = []
                else:
                    u['tweets'].append(l)
        users.append(u)
    for u in users:
        self.users.append(u['id'])
        self.labels.append(u['suspended'])
        self.instances.append('\n'.join(u['tweets']))
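# Illustrative fragment of the raw dataset file that read() parses (user IDs,
# labels, and tweet text are invented): a '!'-prefixed "ID<TAB>IS_SUSPENDED"
# header line, then one tweet per line until the next header.
def _writeToyRawDataset(path='/tmp/toy-users.txt'):
    with open(path, 'w') as fout:
        fout.write('!10001\t1\n')
        fout.write('check out this great deal\n')
        fout.write('follow me for more\n')
        fout.write('!10002\t0\n')
        fout.write('good morning everyone\n')
    return path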
def evaluateDF(dataset, features, top):
    globe.getLogger().info('Evaluating with feature count = %d' % top)
    # filter dataset
    filteredDS = Dataset()

    def addFeature(ins, fins, f):
        if f in ins:
            fins[f] = ins[f]
        else:
            fins[f] = 0

    globe.getLogger().info('Filtering Dataset')
    count = 0
    for uid, label, instance in zip(dataset.users, dataset.labels,
                                    dataset.instances):
        if count % 100 == 0 and count != 0:
            globe.getLogger().info('processed %d instances' % count)
        filteredInstance = {}
        for i in range(top / 2):
            addFeature(instance, filteredInstance, features[i])
            addFeature(instance, filteredInstance, features[-(i + 1)])
        filteredDS.users.append(uid)
        filteredDS.labels.append(label)
        filteredDS.instances.append(filteredInstance)
        count += 1
    filteredDS.instances = DictVectorizer().fit_transform(filteredDS.instances)
    # evaluate
    text_cls.testClassifiers(filteredDS)
def countDF(splitter, outputFilename):
    dataset = loadDataset(DOC_DATAEST)
    sGram = Counter()
    nsGram = Counter()
    i = 0
    for label, instance in zip(dataset.labels, dataset.instances):
        if i % 100 == 0 and i != 0:
            globe.getLogger().info('processed %d instances' % i)
        grams = splitter(instance)
        for g in grams:
            if label == SUSPENDED_LABEL:
                sGram[g] += 1
            else:
                nsGram[g] += 1
        i += 1
    globe.getLogger().info('Len(sGram) = %d, Len(nsGram) = %d'
                           % (len(sGram), len(nsGram)))
    features = {}
    for g, c in sGram.items():
        if g not in features:
            features[g] = [c, 0, 0.0]
        else:
            features[g][0] = c
    for g, c in nsGram.items():
        if g not in features:
            features[g] = [0, c, 0.0]
        else:
            features[g][1] = c
    globe.getLogger().info('Sorting Grams by DF')
    features = [(k, v[0], v[1], float(v[0] + 1) / (v[1] + 1))
                for k, v in features.items()]
    features.sort(key=lambda item: -item[-1])
    globe.getLogger().info('Saving Result')
    with open(outputFilename, 'w') as fout:
        for f in features:
            fout.write('%s\t%d\t%d\t%.10f\n' % f)
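# The last column written by countDF is the add-one-smoothed document-
# frequency ratio (df_suspended + 1) / (df_non_suspended + 1), so grams at the
# top of the output file are the ones most skewed toward suspended users and
# grams at the bottom are most skewed toward normal users. Worked example
# (counts invented): a gram seen in 99 suspended and 9 non-suspended documents
# scores (99 + 1) / (9 + 1) = 10.0.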
def testMI(d):
    globe.getLogger().info('length of vocabulary = %d', len(d.vocabulary))
    # print d.instances.shape
    c = biclass_mutual_info(d.instances, d.labels)
    scores = [(d.vocabulary[i], sf) for i, sf in enumerate(c)]
    scores.sort(key=lambda i: -i[1])
    with open(MI_OUTPUT, 'w') as fout:
        for s in scores:
            fout.write('%s\t%.10f\n' % s)
    size = 1
    while 10 ** size < len(d.vocabulary):
        clf = MultinomialNB()
        X = SelectKBest(biclass_mutual_info,
                        k=10 ** size).fit_transform(d.instances, d.labels)
        scores = cross_validation.cross_val_predict(clf, X, d.labels,
                                                    cv=10, verbose=0)
        globe.getLogger().info('10^%d\t%.6f\t%.6f', size,
                               accuracy_score(d.labels, scores),
                               f1_score(d.labels, scores))
        globe.getLogger().info(confusion_matrix(d.labels, scores))
        size += 1
    clf = MultinomialNB()
    scores = cross_validation.cross_val_predict(clf, d.instances, d.labels,
                                                cv=10, verbose=0)
    globe.getLogger().info('%.1e\t%.6f\t%.6f', len(d.vocabulary),
                           accuracy_score(d.labels, scores),
                           f1_score(d.labels, scores))
def crossValidate(dataset, cls, out):
    kfold = KFold(len(dataset.users), 10, shuffle=True, random_state=42)
    count = 1
    for trainIndex, testIndex in kfold:
        out.write('Cross Validation %d Time\n' % count)
        globe.getLogger().info('Cross Validation %d Time' % count)
        trainX = dataset.instances[trainIndex]
        trainY = dataset.labels[trainIndex]
        testX = dataset.instances[testIndex]
        testY = dataset.labels[testIndex]
        globe.getLogger().info('Training...')
        cls.fit(trainX, trainY)
        globe.getLogger().info('Testing...')
        predicted = cls.predict(testX)
        out.write(np.array_str(confusion_matrix(testY, predicted)) + '\n')
        count += 1
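# Note: the KFold(n, n_folds, ...) call above uses the pre-0.18
# sklearn.cross_validation API. A rough equivalent with the newer
# sklearn.model_selection API (a sketch, not part of this repo; the import is
# aliased to avoid clashing with the KFold name already used here):
def crossValidateNewAPI(dataset, cls, out):
    from sklearn.model_selection import KFold as MSKFold
    kfold = MSKFold(n_splits=10, shuffle=True, random_state=42)
    count = 1
    for trainIndex, testIndex in kfold.split(dataset.instances):
        globe.getLogger().info('Cross Validation %d Time' % count)
        cls.fit(dataset.instances[trainIndex], dataset.labels[trainIndex])
        predicted = cls.predict(dataset.instances[testIndex])
        out.write(np.array_str(
            confusion_matrix(dataset.labels[testIndex], predicted)) + '\n')
        count += 1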
d.read(ORIGINAL_DATASET)
d.save(DOC_DATAEST)
d1 = Dataset()
d1.users = d.users
d1.labels = np.array(d.labels)
count_vect = CountVectorizer(token_pattern=r'\S+')
X_train_counts = count_vect.fit_transform(d.instances)
d1.instances = X_train_counts
d1.save(TOKEN_COUNT_DATASET)
d1.vocabulary = {v: k for k, v in count_vect.vocabulary_.items()}
# tfidf_transformer = TfidfTransformer()
# d1.instances = tfidf_transformer.fit_transform(X_train_counts)
# d1.save(TOKEN_NORM_COUNT_DATASET)
count_vect = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\S+')
X_train_counts = count_vect.fit_transform(d.instances)
d1.instances = X_train_counts
d1.vocabulary = {v: k for k, v in count_vect.vocabulary_.items()}
d1.save(BIGRAM_TOKEN_COUNT_DATASET)
# tfidf_transformer = TfidfTransformer()
# d1.instances = tfidf_transformer.fit_transform(X_train_counts)
# d1.save(BIGRAM_TOKEN_NORM_COUNT_DATASET)


if __name__ == '__main__':
    globe.getLogger().info('Initialize Dataset')
    initNGramDatasets()
def save(self, fname):
    globe.getLogger().info('Save Dataset to %s', fname)
    with open(fname, 'wb') as fout:
        pickle.dump(self, fout)
def loadDataset(fname):
    globe.getLogger().info('Load Dataset from %s', fname)
    # open in binary mode to match the 'wb' used by save()
    with open(fname, 'rb') as fin:
        return pickle.load(fin)
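# Minimal round-trip sketch for save()/loadDataset() above (the path and
# values are invented): the whole Dataset object is pickled to disk and read
# back as-is.
def _roundTripExample(path='/tmp/example-dataset.obj'):
    d = Dataset()
    d.users = [1]
    d.labels = [0]
    d.instances = ['hello world']
    d.save(path)
    return loadDataset(path)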