Example #1
    def high_information_words(self,
                               labelled_words,
                               score_fn=nltk.BigramAssocMeasures.chi_sq,
                               min_score=5):
        """Return every word whose association with some label scores at
        least min_score under score_fn."""
        word_fd = nltk.FreqDist()                   # overall word counts
        label_word_fd = nltk.ConditionalFreqDist()  # word counts per label
        for label, words in labelled_words:
            for word in words:
                word_fd[word] += 1
                label_word_fd[label][word] += 1

        n_xx = label_word_fd.N()  # total number of word occurrences
        high_info_words = set()

        for label in label_word_fd.conditions():
            n_xi = label_word_fd[label].N()  # occurrences under this label
            word_scores = {}
            for word, n_ii in label_word_fd[label].items():
                n_ix = word_fd[word]  # occurrences of this word, any label
                # contingency counts: (word & label, (word total, label total), grand total)
                score = score_fn(n_ii, (n_ix, n_xi), n_xx)
                word_scores[word] = score
            high_info_words |= {
                word for word, score in word_scores.items()
                if score >= min_score
            }

        return high_info_words
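Since the method above never reads self, it can be exercised as a plain function once pasted at module level. A minimal sketch against NLTK's movie_reviews corpus (the corpus choice and the None placeholder for self are assumptions for illustration):

import nltk
from nltk.corpus import movie_reviews  # requires nltk.download('movie_reviews')

# one (label, words) pair per category
labelled = [(cat, movie_reviews.words(categories=[cat]))
            for cat in movie_reviews.categories()]

# self is unused, so None stands in for it
words = high_information_words(None, labelled, min_score=5)
print(len(words), "high-information words")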
Example #2
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    """Map each corpus category to a list of feature dicts, one per file."""
    label_feats = defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats
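A hedged driver for label_feats_from_corpus; movie_reviews and the trivial bag_of_words detector below are assumptions, not part of the original. Note that bag_of_words must already exist when the def above runs, since default arguments are evaluated at definition time:

from collections import defaultdict
from nltk.corpus import movie_reviews

def bag_of_words(words):
    # simplest detector: mark every word as present
    return {word: True for word in words}

lfeats = label_feats_from_corpus(movie_reviews)
print({label: len(feats) for label, feats in lfeats.items()})
# e.g. {'neg': 1000, 'pos': 1000}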
Example #3
def evaluate_model(MaxEntClassifier):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    accuracy = classify.accuracy(MaxEntClassifier, validation_features) * 100
    accuracy_list.append(accuracy)

    for i, (feats, label) in enumerate(validation_features):
        refsets[label].add(i)
        observed = MaxEntClassifier.classify(feats)
        testsets[observed].add(i)

    # compute the metrics once, after the reference/test sets are complete
    negative_precision = precision(refsets['negative'], testsets['negative'])
    positive_precision = precision(refsets['positive'], testsets['positive'])
    positive_recall = recall(refsets['positive'], testsets['positive'])
    negative_recall = recall(refsets['negative'], testsets['negative'])
    try:
        # precision()/recall() return None for empty sets; the resulting
        # TypeError in the arithmetic below is deliberately skipped
        avg_recall = 0.5 * (negative_recall + positive_recall)
        avg_precision = 0.5 * (negative_precision + positive_precision)
        precision_list.append(avg_precision)
        recall_list.append(avg_recall)
    except TypeError:
        pass
    return precision_list, recall_list, accuracy_list
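evaluate_model leans on module-level state (validation_features and the three result lists) plus NLTK's metric helpers. A toy, self-contained setup that exercises it; the two fabricated feature sets and the NaiveBayesClassifier stand-in (any NLTK classifier with a classify method works, though the parameter name suggests Maxent) are assumptions:

import collections
from nltk import classify
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import precision, recall

accuracy_list, precision_list, recall_list = [], [], []
train_features = [({'great': True}, 'positive'),
                  ({'awful': True}, 'negative')]
validation_features = list(train_features)  # toy data: validate on train

classifier = NaiveBayesClassifier.train(train_features)
print(evaluate_model(classifier))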
Example #4
    def getAccuracy(self, classifier, sentences):
        test_set = nltk.classify.apply_features(self.extract_features_unigram,
                                                sentences[:500])
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        classifierResult = {}

        for i, (feats, label) in enumerate(test_set):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        classifierResult['accuracy'] = nltk.classify.util.accuracy(
            classifier, test_set)

        # precision()/recall() return None when a set is empty; report 0.0
        # in that case, and compute each metric only once
        for label in ('suggestion', 'nonsuggestion'):
            p = nltk.precision(refsets[label], testsets[label])
            r = nltk.recall(refsets[label], testsets[label])
            classifierResult[label + ' precision'] = p if p is not None else 0.0
            classifierResult[label + ' recall'] = r if r is not None else 0.0

        return classifierResult
Example #5
def cluster_texts(texts, clusters=3):
    """Transform texts to TF-IDF coordinates and cluster them using K-Means."""
    # process_text is a tokenizer assumed to be defined elsewhere
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.5,
                                 min_df=0.1,
                                 lowercase=True)

    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)

    # map each cluster label to the indices of the texts assigned to it
    clustering = collections.defaultdict(list)

    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)

    return clustering
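cluster_texts assumes process_text and the sklearn/NLTK imports are in scope; a small end-to-end sketch with a stand-in tokenizer and toy texts (both assumptions for illustration):

import collections
from nltk.corpus import stopwords  # requires nltk.download('stopwords')
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

def process_text(text):
    # stand-in tokenizer; the real process_text is defined elsewhere
    return text.lower().split()

texts = ["the cat sat on the mat", "dogs and cats play",
         "stocks rallied today", "investors bought shares",
         "the dog chased the cat", "bond prices fell"]
print(dict(cluster_texts(texts, clusters=2)))
# e.g. {0: [0, 1, 4], 1: [2, 3, 5]} (cluster ids vary between runs)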
Example #6
    def createHistogram(self, relativePathToHistogram):
        """
        Creates the histogram used for comparison later on.

        The dictionary is stored in the self.histogram attribute, mapping
        every word in the reference text to its number of occurrences.
        The dictionary also includes one key, "totalWordsInFile", which
        holds the total word count.
        """
        self.histogram = collections.defaultdict(int)
        count = 0
        # count words and the running total in a single pass over the file
        with open(os.path.join(os.path.dirname(__file__),
                               relativePathToHistogram),
                  mode='r',
                  encoding='utf-8-sig') as file:
            for sentence in file.read().splitlines():
                # \w+ drops non-word characters and splits hyphenated words
                for word in RegexpTokenizer(r'\w+').tokenize(sentence):
                    self.histogram[word.lower()] += 1
                    count += 1
        self.histogram['totalWordsInFile'] = count
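The RegexpTokenizer(r'\w+') call above keeps only runs of word characters, so punctuation disappears and hyphenated or apostrophized words split apart; a quick check:

from nltk.tokenize import RegexpTokenizer
print(RegexpTokenizer(r'\w+').tokenize("Well-known words don't count twice."))
# ['Well', 'known', 'words', 'don', 't', 'count', 'twice']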
Example #7
def calc_model():
    global word_features, classifier
    documents = []
    pos = 0
    neg = 0
    with open("data.csv") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for record in csv_reader:
            # strip @mentions and URLs, then collapse whitespace
            ap = (' '.join(
                re.sub(r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)", " ",
                       record[1]).split()))
            ap = word_tokenize(ap)
            documents.append((ap, record[0]))
            if '0' == record[0]:
                neg += 1
            elif '1' == record[0]:
                pos += 1

    print("neg ", neg)
    print("pos ", pos)

    shuffle(documents)

    all_words = []
    for tweet in documents:
        for w in tweet[0]:
            all_words.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    print("getting features")
    # take the 1000 most frequent words as features; FreqDist keys are in
    # insertion order, not frequency order, so most_common() is needed here
    word_features = [w for w, _ in all_words.most_common(1000)]

    save_pickle(pickle_word_features, word_features)
    print("saved word features")

    print("setting features per tweet")
    feature_sets = np.array([[find_features(tweet), category]
                             for (tweet, category) in documents])

    data = feature_sets[:, 0]

    k = 10
    cv = KFold(k)
    accur = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    i = 0
    for train_index, test_index in cv.split(data):
        print("starting split " + str(i + 1))
        training_this_round = feature_sets[train_index]
        testing_this_round = feature_sets[test_index]
        linear_svc_classifier = SklearnClassifier(LinearSVC())
        classifier = linear_svc_classifier.train(training_this_round)
        accur.insert(
            i, nltk.classify.util.accuracy(classifier, testing_this_round))
        print('accuracy:', accur[i])
        i = i + 1
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for j, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(j)
            observed = classifier.classify(feats)
            testsets[observed].add(j)

        cv_pos_precision = precision(refsets['1'], testsets['1'])
        cv_pos_recall = recall(refsets['1'], testsets['1'])
        cv_neg_precision = precision(refsets['0'], testsets['0'])
        cv_neg_recall = recall(refsets['0'], testsets['0'])

        print('Precision:', cv_pos_precision)
        print('Recall:', cv_pos_recall)
        print('Precision neg:', cv_neg_precision)
        print('Recall neg:', cv_neg_recall)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)

    print('LinearSVC_classifier average accuracy:', sum(accur) / len(accur))
    print('precision',
          (sum(pos_precision) / len(accur) + sum(neg_precision) / len(accur)) /
          2)
    print('recall',
          (sum(pos_recall) / len(accur) + sum(neg_recall) / len(accur)) / 2)

    save_pickle(pickle_model, classifier)
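The np.array wrapper around the feature sets exists so that KFold's integer index arrays can slice the data directly; a minimal, self-contained illustration of that fancy indexing (toy data, not the tweet features above):

import numpy as np
from sklearn.model_selection import KFold

data = np.array([['a', 1], ['b', 0], ['c', 1], ['d', 0]], dtype=object)
for train_idx, test_idx in KFold(2).split(data):
    # integer-array indexing selects whole rows per fold
    print(data[train_idx][:, 0], data[test_idx][:, 0])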
Example #8
def groupAnagrams(self, strs: List[str]) -> List[List[str]]:
    # anagrams share the same sorted-letter key
    ans = collections.defaultdict(list)
    for ch in strs:
        ans[tuple(sorted(ch))].append(ch)
    return list(ans.values())  # convert dict_values to the declared List
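A quick self-contained check of the anagram grouping; the Solution wrapper is the usual LeetCode harness, assumed here:

import collections
from typing import List

class Solution:
    def groupAnagrams(self, strs: List[str]) -> List[List[str]]:
        ans = collections.defaultdict(list)
        for s in strs:
            ans[tuple(sorted(s))].append(s)
        return list(ans.values())

print(Solution().groupAnagrams(["eat", "tea", "tan", "ate", "nat", "bat"]))
# [['eat', 'tea', 'ate'], ['tan', 'nat'], ['bat']]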
Example #9
    # divide data into training and testing datasets
    threshold_factor = 0.8  # split data into train and test (80/20)
    threshold_pos = int(threshold_factor * len(features_pos))
    threshold_neg = int(threshold_factor * len(features_neg))

    # extract the features
    features_train = features_pos[:threshold_pos] + features_neg[:threshold_neg]
    features_test = features_pos[threshold_pos:] + features_neg[threshold_neg:]
    print("\nNumber of training datapoints: ", len(features_train))
    print("Number of test datapoints: ", len(features_test))

    # define classifier object and train it
    NBClassifier = NaiveBayesClassifier.train(features_train)
    print("\nAccuracy of NBClassifier: ", nltk.classify.util.accuracy(NBClassifier, features_test))

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (features, label) in enumerate(features_test):
        refsets[label].add(i)
        observed = NBClassifier.classify(features)
        testsets[observed].add(i)

    print("refsets: ", refsets)
    print("testsets: ", testsets)

    # print top 10 most informative words
    for item in NBClassifier.most_informative_features()[:10]:
        print(item[0])

    # sample input sentences