def high_information_words(self, labelled_words, score_fn=nltk.BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = nltk.FreqDist()
    label_word_fd = nltk.ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    # n_xx: total number of word occurrences across all labels
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        # n_xi: total word occurrences for this label
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].items():
            # n_ii: occurrences of this word with this label
            # n_ix: occurrences of this word across all labels
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
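# A minimal usage sketch for high_information_words, assuming `extractor` is a
# hypothetical instance of the class that defines this method, and that nltk and
# collections are imported. Words whose chi-square score reaches min_score are kept.
labelled = [
    ('positive', ['great', 'movie', 'loved', 'it']),
    ('negative', ['terrible', 'movie', 'hated', 'it']),
]
informative = extractor.high_information_words(labelled, min_score=3)
print(informative)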
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    label_feats = defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats
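# A minimal usage sketch for label_feats_from_corpus, assuming defaultdict is
# imported and that bag_of_words (the default feature_detector, defined elsewhere)
# maps a word list to a featureset such as {word: True, ...}. The NLTK
# movie_reviews corpus is organised into 'pos' and 'neg' categories.
from nltk.corpus import movie_reviews

lfeats = label_feats_from_corpus(movie_reviews)
print(list(lfeats.keys()))    # ['neg', 'pos']
print(len(lfeats['pos']))     # number of positive reviews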
def evaluate_model(MaxEntClassifier):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    accuracy = classify.accuracy(MaxEntClassifier, validation_features) * 100
    accuracy_list.append(accuracy)
    for i, (feats, label) in enumerate(validation_features):
        refsets[label].add(i)
        observed = MaxEntClassifier.classify(feats)
        testsets[observed].add(i)
    negative_precision = precision(refsets['negative'], testsets['negative'])
    positive_precision = precision(refsets['positive'], testsets['positive'])
    positive_recall = recall(refsets['positive'], testsets['positive'])
    negative_recall = recall(refsets['negative'], testsets['negative'])
    try:
        # precision/recall return None when a label was never predicted,
        # which makes the averaging below raise a TypeError
        avg_recall = 0.5 * (negative_recall + positive_recall)
        avg_precision = 0.5 * (negative_precision + positive_precision)
        precision_list.append(avg_precision)
        recall_list.append(avg_recall)
    except TypeError:
        pass
    return precision_list, recall_list, accuracy_list
def getAccuracy(self, classifier, sentences):
    test_set = nltk.classify.apply_features(self.extract_features_unigram, sentences[:500])
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    classifierResult = {}
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    classifierResult['accuracy'] = nltk.classify.util.accuracy(classifier, test_set)
    suggestion_precision = nltk.precision(refsets['suggestion'], testsets['suggestion'])
    suggestion_recall = nltk.recall(refsets['suggestion'], testsets['suggestion'])
    nonsuggestion_precision = nltk.precision(refsets['nonsuggestion'], testsets['nonsuggestion'])
    nonsuggestion_recall = nltk.recall(refsets['nonsuggestion'], testsets['nonsuggestion'])
    # precision/recall return None when a label never occurs; report 0.0 instead
    classifierResult['suggestion precision'] = 0.0 if suggestion_precision is None else suggestion_precision
    classifierResult['suggestion recall'] = 0.0 if suggestion_recall is None else suggestion_recall
    classifierResult['nonsuggestion precision'] = 0.0 if nonsuggestion_precision is None else nonsuggestion_precision
    classifierResult['nonsuggestion recall'] = 0.0 if nonsuggestion_recall is None else nonsuggestion_recall
    return classifierResult
def cluster_texts(texts, clusters=3):
    """Transform texts to Tf-Idf coordinates and cluster texts using K-Means."""
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.5,
                                 min_df=0.1,
                                 lowercase=True)
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
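# A minimal usage sketch for cluster_texts, assuming TfidfVectorizer and KMeans come
# from scikit-learn, stopwords from nltk.corpus, and that a process_text tokenizer is
# defined elsewhere. The return value maps each cluster label to document indices.
texts = [
    "the cat sat on the mat",
    "dogs and cats are common pets",
    "stock prices fell sharply today",
    "the market rallied after the earnings report",
    "my cat chased the neighbour's dog",
    "investors worry about interest rates",
]
doc_clusters = cluster_texts(texts, clusters=2)
for label, doc_ids in doc_clusters.items():
    print(label, doc_ids)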
def createHistogram(self, relativePathToHistogram):
    """
    Creates the histogram used for later comparisons.

    The dictionary is stored in the self.histogram attribute and maps every
    word in the reference text to its number of occurrences. The dictionary
    also includes one key called "totalWordsInFile" which holds the total
    word count.
    """
    self.histogram = collections.defaultdict(int)
    count = 0
    # Use one double for-loop for efficiency reasons
    with open(os.path.join(os.path.dirname(__file__), relativePathToHistogram),
              mode='r', encoding='utf-8-sig') as file:
        for sentence in file.read().splitlines():
            # Removes non-words, and splits hyphenated words using nltk
            for word in RegexpTokenizer(r'\w+').tokenize(sentence):
                self.histogram[word.lower()] += 1
                count += 1
    self.histogram['totalWordsInFile'] = count
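# A minimal usage sketch for createHistogram, assuming it is a method of a
# hypothetical wrapper class (here called DocumentComparer) and that a plain-text
# reference file exists at the given path relative to the module.
comparer = DocumentComparer()
comparer.createHistogram('data/reference.txt')   # hypothetical path
print(comparer.histogram['totalWordsInFile'])    # total number of words counted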
def calc_model():
    global word_features, classifier
    documents = []
    pos = 0
    neg = 0
    with open("data.csv") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for record in csv_reader:
            # strip @mentions and URLs from the tweet text, then tokenize
            ap = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", record[1]).split())
            ap = word_tokenize(ap)
            documents.append((ap, record[0]))
            if '0' == record[0]:
                neg = neg + 1
            elif '1' == record[0]:
                pos = pos + 1
    print("neg ", neg)
    print("pos ", pos)
    shuffle(documents)
    all_words = []
    for tweet in documents:
        for w in tweet[0]:
            all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)
    print("getting features")
    word_features = list(all_words.keys())[:1000]
    save_pickle(pickle_word_features, word_features)
    print("saved word features")
    print("setting features per tweet")
    feature_sets = np.array([[find_features(tweet), category]
                             for (tweet, category) in documents])
    data = feature_sets[:, 0]
    # 10-fold cross-validation
    k = 10
    cv = KFold(k)
    accur = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    i = 0
    for train_index, test_index in cv.split(data):
        print("starting split " + str(i + 1))
        training_this_round = feature_sets[train_index]
        testing_this_round = feature_sets[test_index]
        linear_svc_classifier = SklearnClassifier(LinearSVC())
        classifier = linear_svc_classifier.train(training_this_round)
        accur.insert(i, nltk.classify.util.accuracy(classifier, testing_this_round))
        print('accuracy:', accur[i])
        i = i + 1
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for j, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(j)
            observed = classifier.classify(feats)
            testsets[observed].add(j)
        cv_pos_precision = precision(refsets['1'], testsets['1'])
        cv_pos_recall = recall(refsets['1'], testsets['1'])
        cv_neg_precision = precision(refsets['0'], testsets['0'])
        cv_neg_recall = recall(refsets['0'], testsets['0'])
        print('Precision:', cv_pos_precision)
        print('Recall:', cv_pos_recall)
        print('Precision neg:', cv_neg_precision)
        print('Recall neg:', cv_neg_recall)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
    print('LinearSVC_classifier average accuracy:', sum(accur) / len(accur))
    print('precision', (sum(pos_precision) / len(accur) + sum(neg_precision) / len(accur)) / 2)
    print('recall', (sum(pos_recall) / len(accur) + sum(neg_recall) / len(accur)) / 2)
    save_pickle(pickle_model, classifier)
def groupAnagrams(self, strs: List[str]) -> List[List[str]]:
    # words that are anagrams of each other share the same sorted character tuple
    ans = collections.defaultdict(list)
    for word in strs:
        ans[tuple(sorted(word))].append(word)
    return list(ans.values())
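# A minimal usage sketch for groupAnagrams, assuming it is defined on a Solution
# class in the usual LeetCode style, with List imported from typing.
sol = Solution()
print(sol.groupAnagrams(["eat", "tea", "tan", "ate", "nat", "bat"]))
# e.g. [['eat', 'tea', 'ate'], ['tan', 'nat'], ['bat']]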
# divide data into training and testing datasets
threshold_factor = 0.8  # split data into train and test (80/20)
threshold_pos = int(threshold_factor * len(features_pos))
threshold_neg = int(threshold_factor * len(features_neg))

# split the feature sets into training and testing subsets
features_train = features_pos[:threshold_pos] + features_neg[:threshold_neg]
features_test = features_pos[threshold_pos:] + features_neg[threshold_neg:]
print("\nNumber of training datapoints: ", len(features_train))
print("Number of test datapoints: ", len(features_test))

# define classifier object and train it
NBClassifier = NaiveBayesClassifier.train(features_train)
print("\nAccuracy of NBClassifier: ", nltk.classify.util.accuracy(NBClassifier, features_test))

# build reference and observed label sets for per-class metrics
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (features, label) in enumerate(features_test):
    refsets[label].add(i)
    observed = NBClassifier.classify(features)
    testsets[observed].add(i)
print("refsets: ", refsets)
print("testsets: ", testsets)

# print top 10 most informative words
for item in NBClassifier.most_informative_features()[:10]:
    print(item[0])

# sample input sentences