def get_queries(self):
    # Read the raw queries, tokenize them, and write one query per line
    # so they can be fed to Lucene.
    queries, queries_1 = self.read_queries()
    tokenize = Tokenizer.Tokenize(" ")
    self.modified_queries = tokenize.process_data(queries)
    self.modified_queries_1 = queries_1
    # Use a context manager so the file is closed even on error.
    with open("queries for lucene.txt", 'w') as f:
        for q in self.modified_queries:
            q = q.strip("\n").replace("\n", ' ')
            f.write(str(q))
            f.write("\n")

def test(text):
    # Smoke-test helper; the parameter is renamed from `str` to `text`
    # to avoid shadowing the built-in type.
    Tokenizer.Initialize(text)
    print(text)
    Tokenizer.Tokenize()

def start_tokenizing(self):
    # Tokenize the corpus and record the output path and document count.
    tokenize = Tokenizer.Tokenize(self.source_path)
    self.new_source_path, self.N = tokenize.start_processing()  # "tokenized_corpus", 3204
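
# A minimal sketch (an assumption, not the project's actual Tokenizer module) of
# the interface the methods above rely on: Tokenize(source), .process_data(queries),
# and .start_processing() returning (new_source_path, N). Only the method names come
# from the call sites; every body below is hypothetical.
import os


class Tokenize:
    def __init__(self, source):
        # `source` is a delimiter in get_queries (" ") and a corpus path in
        # start_tokenizing, mirroring how the call sites use it.
        self.source = source

    def process_data(self, queries):
        # Hypothetical normalization: lowercase and collapse whitespace per query.
        return [" ".join(q.lower().split()) for q in queries]

    def start_processing(self, out_dir="tokenized_corpus"):
        # Hypothetical corpus pass: normalize each file under self.source into
        # out_dir and return (output path, number of documents processed).
        os.makedirs(out_dir, exist_ok=True)
        count = 0
        for name in os.listdir(self.source):
            src = os.path.join(self.source, name)
            if not os.path.isfile(src):
                continue
            with open(src, encoding="utf-8") as fin, \
                    open(os.path.join(out_dir, name), "w", encoding="utf-8") as fout:
                fout.write(" ".join(fin.read().lower().split()))
            count += 1
        return out_dir, count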