Example #1
    def createVocab(self, dirs):
        print("Creating vocab mapping (max size: %d, min frequency: %d)..." % (self.max_vocab_size, self.min_count))
        dic = {}
        for d in dirs:
            for f in os.listdir(d):
                with open(os.path.join(d, f), 'r') as review:
                    tokens = tokenizer.tokenize(review.read().lower(), self.remove_punct, self.remove_stopwords)
                    for t in tokens:
                        if t not in dic:
                            dic[t] = 1
                        else:
                            dic[t] += 1
        d = {}
        counter = 0
        with open(self.dataDir + 'vocab.txt', 'w') as v:
            for w in sorted(dic, key=dic.get, reverse=True):
                # keep only words that occur at least min_count times
                if dic[w] < self.min_count:
                    break

                v.write(w + " " + str(dic[w]) + "\n")

                d[w] = counter
                counter += 1
                # keep only the max_vocab_size most frequent tokens
                if self.max_vocab_size > -1 and counter >= self.max_vocab_size:
                    break
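The vocab.txt written above contains one "word count" pair per line, ordered by descending frequency, so the line number can double as the word's index. A minimal sketch of reading it back into a word-to-index mapping (the helper name is mine, not from the source):

def load_vocab_txt(path):
    # Rebuild word -> index from the descending-frequency vocab.txt above.
    mapping = {}
    with open(path, 'r') as v:
        for idx, line in enumerate(v):
            word = line.split()[0]  # each line is "<word> <count>"
            mapping[word] = idx
    return mapping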
Example #2
    def createVocab_old(self, dirs):
        print("Creating vocab mapping (max size: %d, min frequency: %d)..." % (self.max_vocab_size, self.min_count))
        dic = {}
        for d in dirs:
            for f in os.listdir(d):
                with open(os.path.join(d, f), 'r') as review:
                    tokens = tokenizer.tokenize(review.read().lower(), self.remove_punct, self.remove_stopwords)
                    for t in tokens:
                        if t not in dic:
                            dic[t] = 1
                        else:
                            dic[t] += 1
        d = {}
        counter = 0
        for w in sorted(dic, key=dic.get, reverse=True):
            # keep only words that occur at least min_count times
            if dic[w] < self.min_count:
                break
            d[w] = counter
            counter += 1
            # keep only the max_vocab_size most frequent tokens
            if self.max_vocab_size > -1 and counter >= self.max_vocab_size:
                break

        # add out-of-vocabulary (<UNK>) and padding (<PAD>) tokens
        d["<UNK>"] = counter
        counter += 1
        d["<PAD>"] = counter
        print("vocab mapping created: size: %d discarded: %d" % (len(d), len(dic) - len(d) + 2))
        with open(self.dataDir + 'vocab.txt', 'wb') as handle:
            pickle.dump(d, handle)
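This older variant pickles the complete word-to-index dict (including <UNK> and <PAD>), so it can be restored and queried directly. A small sketch, assuming the same file path; unknown tokens fall back to the <UNK> index:

import pickle

def load_pickled_vocab(path):
    # Restore the dict written by createVocab_old.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Usage (illustrative):
# vocab = load_pickled_vocab(dataDir + 'vocab.txt')
# index = vocab.get(token, vocab["<UNK>"])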
Example #3
    def createProcessedDataFile(self, vocab_mapping, directory, pid, max_seq_length, lock):
        count = 0
        # placeholder row; dropped after the loop
        data = np.arange(max_seq_length + 2)
        for f in os.listdir(directory):
            count += 1
            if count % 100 == 0:
                lock.acquire()
                print("Processing: " + f + " the " + str(count) + "th file... on process: " + str(pid))
                lock.release()
            with open(os.path.join(directory, f), 'r') as review:
                tokens = tokenizer.tokenize(review.read().lower(), self.remove_punct, self.remove_stopwords)
                numTokens = len(tokens)
                indices = [vocab_mapping.getIndex(j) for j in tokens]
                # pad or truncate the sequence to max_seq_length
                if len(indices) < max_seq_length:
                    indices = indices + [vocab_mapping.getIndex("<PAD>") for _ in range(max_seq_length - len(indices))]
                else:
                    indices = indices[0:max_seq_length]
            # label: 1 for positive reviews, 0 for negative
            if "pos" in directory:
                indices.append(1)
            else:
                indices.append(0)
            # original sequence length, capped at max_seq_length
            indices.append(min(numTokens, max_seq_length))
            assert len(indices) == max_seq_length + 2, str(len(indices))
            data = np.vstack((data, indices))
        # drop the placeholder row
        data = data[1:]
        lock.acquire()
        print("Saving data file{0} to disk...".format(str(pid)))
        lock.release()
        self.saveData(data, pid, directory)
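The vocab_mapping object used above is not shown; the method only relies on a getIndex call that resolves known tokens and falls back to <UNK> for unknown ones (both special tokens are added in Example #2). A hypothetical minimal stand-in, for illustration only:

class VocabMapping:
    # Hypothetical wrapper around the pickled word -> index dict; the real
    # class is not part of the source examples.
    def __init__(self, mapping):
        self.mapping = mapping

    def getIndex(self, token):
        # Unknown tokens map to the <UNK> index.
        return self.mapping.get(token, self.mapping["<UNK>"])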
Example #4
    def get_tokenized_sentences(self):
        instances = []
        for idx, sent in enumerate(self.sentences):
            tokens = tk.tokenize(sent.text)
            cleaned_tokens = [clean_str(token) for token in tokens]

            instances.append(
                TokenizedInstance(sent.text, self.labels[idx], cleaned_tokens))
        return instances
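TokenizedInstance and clean_str are defined elsewhere in that codebase; for the snippet to be readable on its own, TokenizedInstance can be assumed to be a plain container, e.g.:

from collections import namedtuple

# Assumed shape only; the real class is not shown in the source.
TokenizedInstance = namedtuple("TokenizedInstance", ["text", "label", "tokens"])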
Example #5
    def createCorpus(self):
        corpus = ""
        for dir in self.vocabDirs:
            print("\tNow processing folder: " + dir)

            for f in os.listdir(dir):
                with open(os.path.join(dir, f), 'r') as review:
                    review_tkn = tokenizer.tokenize(review.read(), self.remove_punct, self.remove_stopwords)
                    corpus += " ".join(review_tkn) + "\n"

        # name_corpus = "corpus{p}{s}".format(p="_nopunct" if args.punct else "", s="_nostop" if args.stop else "")
        with open(self.dataDir + "corpus.txt", "w") as text_file:
            text_file.write(corpus)
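corpus.txt ends up with one whitespace-joined, tokenized review per line, so it can be streamed back later without re-tokenizing. A minimal sketch (the helper name is mine):

def read_corpus(path):
    # Yield the token list of one review per line of corpus.txt.
    with open(path, 'r') as text_file:
        for line in text_file:
            yield line.split()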
    def extract_features(self):
        instances = []
        for text_idx, text in enumerate(self.sentences):
            tokens = tk.tokenize(text)

            # list to hold features for text
            feature_list = []

            # emit features per token: SUFFIX3, PREFIX2, W, context words
            # (WORD+/-1, WORD+/-2), ALL_CAPS, EXCLAMATION, and LABEL when a
            # token matches one of the labels (e.g. joy, anger)
            for idx, token in enumerate(tokens):
                # suffix3
                feature_list.append("SUFFIX3=" + token[-3:])
                # prefix2
                feature_list.append("PREFIX2=" + token[:2])
                # w
                feature_list.append("W=" + token)
                # context window of 2 (indices wrap around via modulo)
                feature_list.append("WORD+1=" +
                                    tokens[(idx + 1) % len(tokens)])
                feature_list.append("WORD-1=" +
                                    tokens[(idx - 1) % len(tokens)])

                feature_list.append("WORD+2=" +
                                    tokens[(idx + 2) % len(tokens)])
                feature_list.append("WORD-2=" +
                                    tokens[(idx - 2) % len(tokens)])

                if token.isupper():
                    feature_list.append('ALL_CAPS=' + token)
                if "!" in token:
                    feature_list.append('EXCLAMATION=1')
                if token in self.labels:
                    feature_list.append('LABEL=' + token)

            # create DataInstance (use the enumerate index so duplicate
            # sentences still get the right label)
            data_instance = DataInstance(text=text,
                                         label=self.labels[text_idx],
                                         features=feature_list)
            instances.append(data_instance)

        return instances
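To make the window features concrete, here is a purely illustrative trace (not from the source) of what the loop above emits for one token:

# For tokens = ["I", "am", "so", "happy", "today"], the token "happy" (idx = 3)
# contributes:
#   SUFFIX3=ppy  PREFIX2=ha  W=happy
#   WORD+1=today  WORD-1=so  WORD-2=am
#   WORD+2=I   <- wraps around to the start because indices are taken modulo len(tokens)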
    def count_words_in_labels(self):
        wordcount_per_labels = {label: {} for label in self.distinct_labels}

        for idx, sent in enumerate(self.sentences):
            label = self.labels[idx]

            cleared_tokens = [clean_str(token) for token in tk.tokenize(sent.text)]
            for cleared_token in cleared_tokens:
                label_counts = wordcount_per_labels[label]
                label_counts[cleared_token] = label_counts.get(cleared_token, 0) + 1

        wordsum_per_labels = {
            label: sum(wordcount_per_labels[label].values())
            for label in self.distinct_labels
        }
        return wordcount_per_labels, wordsum_per_labels
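The per-label word counts and totals returned here are exactly what a per-label word likelihood needs. A sketch with add-one (Laplace) smoothing, under the assumption that these counts feed a Naive Bayes style classifier (the consumer is not shown in the source):

def word_likelihood(word, label, wordcount_per_labels, wordsum_per_labels, vocab_size):
    # P(word | label) with add-one smoothing over a vocabulary of vocab_size words.
    count = wordcount_per_labels[label].get(word, 0)
    return (count + 1.0) / (wordsum_per_labels[label] + vocab_size)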