return None def stemming(self, text): return None def auxillary_preprocess(self, text_with_rating): func_list = [ self.convert_lower_case, self.ignore_punctuation, self.tokenize, self.ignore_stopwords ] inp = text_with_rating[0] for func in func_list: inp = func(inp) return (inp, text_with_rating[1]) def process_text(self, text_generator, correct_spell=False, stem=False): for batch in text_generator: print batch yield map(self.auxillary_preprocess, batch) if __name__ == "__main__": from input_reader import InputReader input_reader = InputReader(1, 10) input_dir = "/home/rohittulu/Documents/aclImdb/train/pos/" text_generator = input_reader.get_batches(input_dir) text_processor = Textpreprocessor() for processed_batch in text_processor.process_text(text_generator): print processed_batch
to represent a document. """ def __init__(self, outfile): self.outfile = outfile def calculate(self, batch_generator): bow = set() with open(self.outfile, "w") as fp: for idx, batch in enumerate(batch_generator): if idx % 10 == 0: print("Batches read:" + str(idx)) for (tokens, rank) in batch: bow.update(tokens) for token in bow: fp.write(token) fp.write("\n") if __name__ == "__main__": bag_of_words_generator = BagOfWordsGenerator("bow.txt") from input_reader import InputReader from text_preprocessor import Textpreprocessor input_dir = "/home/rohittulu/Documents/aclImdb/train/pos/" input_reader = InputReader(10, -1) x = input_reader.get_batches(input_dir) text_preprocessor = Textpreprocessor() batch_generator = text_preprocessor.process_text(x) bag_of_words_generator.calculate(batch_generator) print("BOW CREATED")