from HindiTokenizer import Tokenizer


def sent_tokenize(fileid=None):
    """Return a list of sentence lists, one per text in the corpus."""
    token_list = []
    for text in raw(fileid):  # raw() is the corpus reader's raw-text accessor (not shown here)
        t = Tokenizer(text)
        t.generate_sentences()
        token_list.append(t.sentences)
    return token_list
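# A minimal sketch of how sent_tokenize() above could be driven, assuming a toy
# raw() helper over a flat corpus directory; CORPUS_DIR and the file ids are
# hypothetical and not part of the original code.
import os

CORPUS_DIR = "corpus"  # assumed location of the Hindi text files


def raw(fileid=None):
    """Yield the raw text of one file id, or of every file in CORPUS_DIR."""
    fileids = [fileid] if fileid else sorted(os.listdir(CORPUS_DIR))
    for name in fileids:
        with open(os.path.join(CORPUS_DIR, name)) as handle:
            yield handle.read()

# sentences = sent_tokenize()               # sentences for every corpus file
# sentences = sent_tokenize("review1.txt")  # sentences for a single (hypothetical) file id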
# -*- coding: utf-8 -*-
'''
Tokeniser for Hindi: reads a corpus file line by line and writes one
sentence per line to the output file.
'''
from HindiTokenizer import Tokenizer
import sys

if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] + " <corpusfile> <outputfile>\n")
        sys.exit(2)
    file_name = sys.argv[1]
    fopen = open(file_name, "r")
    a = open(sys.argv[2], "w")
    while True:
        line = fopen.readline()[0:-1]  # strip the trailing newline
        if line == '':                 # stop at EOF (or the first blank line)
            break
        else:
            t = Tokenizer(line)
            t.generate_sentences()
            for i in t.sentences:      # print_sentences() only prints; the sentences live in t.sentences
                a.write(i + "\n")
    a.close()
    fopen.close()
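# A sketch of an alternative to the readline loop above, assuming the whole
# corpus file fits in memory: Tokenizer can load a file itself via
# read_from_file() (used the same way in the last snippet below). The file
# names in the commented call are hypothetical.
from HindiTokenizer import Tokenizer


def tokenize_file(corpus_path, output_path):
    t = Tokenizer()
    t.read_from_file(corpus_path)   # read the entire corpus file into the tokenizer
    t.generate_sentences()
    with open(output_path, "w") as out:
        for sentence in t.sentences:
            out.write(sentence + "\n")

# tokenize_file("corpus.txt", "sentences.txt")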
# Python 2 snippet: tokenise Hindi reviews, strip stopwords, and fit an LDA topic model.
import os

import gensim
from gensim import corpora
from HindiTokenizer import Tokenizer


def removeStopWords(word_list):
    # 'f' is the stopword file handle opened earlier in the original source (not shown here)
    stopwords = [x.strip() for x in f.readlines()]
    tokens = [i for i in word_list if unicode(i) not in stopwords]
    return tokens


texts = []
documents = {}
for i in os.listdir("Reviews"):
    if i.endswith(".txt"):
        with open("Reviews\\" + i) as f:
            documents[i] = []
            for line in f:
                l = line.split('#####')[0]   # keep only the review text before the '#####' separator
                t = Tokenizer(l)
                t.generate_sentences()
                for s in t.sentences:
                    if not s.strip() == '':
                        documents[i].append(s)
                t.tokenize()
                tokens = removeStopWords(t.tokens)
                texts.append(tokens)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.ldamodel.LdaModel(corpus, num_topics=9, id2word=dictionary, passes=100)
val = model.print_topics(num_topics=8, num_words=10)
print val
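# A small sketch of how the trained model above could be queried for one unseen
# review; new_review is a placeholder, and dictionary/model/removeStopWords are
# the objects built above. get_document_topics() is a standard gensim LdaModel method.
new_review = u"..."   # placeholder Hindi review text
t = Tokenizer(new_review)
t.tokenize()
bow = dictionary.doc2bow(removeStopWords(t.tokens))
for topic_id, weight in model.get_document_topics(bow):
    print("topic %d: %.3f" % (topic_id, weight))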
# Walk a two-level directory tree of labelled Hindi documents and tokenise each file.
import os
import re

from HindiTokenizer import Tokenizer
# TokenizeSentence is assumed to come from CLTK (cltk.tokenize.sentence);
# the original snippet's imports are not shown.
from cltk.tokenize.sentence import TokenizeSentence

PATH = '../Data/'
tokenizer = TokenizeSentence('hindi')
files = os.listdir(PATH)
features = []
values = []
for file in files:
    if os.path.isdir(PATH + file + '/'):
        for inner_file in os.listdir(PATH + file + '/'):
            if os.path.isdir(PATH + file + '/' + inner_file + '/'):
                for inner_inner_file in os.listdir(PATH + file + '/' + inner_file + '/'):
                    values.append(file)   # record the top-level directory name (presumably the label)
                    t = Tokenizer()
                    t.read_from_file(PATH + file + '/' + inner_file + '/' + inner_inner_file)
                    t.generate_sentences()          # splits on the danda; sentences are stored in t.sentences
                    split_sentences = t.sentences
                    final_sentences = []
                    for i in split_sentences:
                        parts = re.split(r'\?|!', i)            # also split on '?' and '!'
                        for k in parts:
                            final_sentences.append(k)
                    filtered_sentences = []
                    for i in final_sentences:
                        if not re.match(r'^\s+$', i):           # drop whitespace-only fragments
                            filtered_sentences.append(i)
                    words = []
                    for i in filtered_sentences:
                        sentence_tokenized = tokenizer.tokenize(i)
                        for k in sentence_tokenized:
                            words.append(k.strip('\n'))
                    length = [