def wordcount(text, logger=xetrapal.astra.baselogger):
    # needs: collections, tqdm, HindiTokenizer's Tokenizer and the xetrapal logger
    # leftover notes from an earlier regex-based cleanup pass:
    # matches 30 repetitions or more of one character
    # regex2 = re.compile(u'(^.{30,})')
    # regex3 = re.compile(u'(\A\u002D)|(\u002D\Z)')
    # create dictionary to store word frequencies
    # process each file chunk
    # remove special characters and anything beyond Unicode 382
    # preCleanText = regex1.sub(' ', decodedText)
    # parse text
    # parsedText = re.split(' ', text)
    wordFreq = collections.Counter()
    t = Tokenizer(text)
    logger.info("Beginning generate word count on input")
    logger.info("Tokenizing the input")
    t.tokenize()
    parsedText = t.tokens
    # clean up and count words
    while "" in parsedText:
        parsedText.remove("")
    for word in tqdm(parsedText):
        if word == '':
            continue
        # add word to count
        wordFreq[word] += 1
    return wordFreq
def sent_tokenize(fileid=None):
    # raw() is assumed to be a corpus-reader helper defined elsewhere that yields raw text
    token_list = []
    for text in raw(fileid):
        t = Tokenizer(text)
        t.generate_sentences()
        token_list.append(t.sentences)
    return token_list
def wordcount(text, logger=xetrapal.astra.baselogger):
    wordFreq = collections.Counter()
    t = Tokenizer(text)
    logger.info("Beginning generate word count on input")
    logger.info("Tokenizing the input")
    t.tokenize()
    parsedText = t.tokens
    # clean up and count word
    while "" in parsedText:
        parsedText.remove("")
    for word in tqdm(parsedText):
        if word == '':
            continue
        # add word to count
        wordFreq[word] += 1
    return wordFreq
def ngramfrequencyht(filename, gramlength=3, logger=xetrapal.astra.baselogger):
    # needs: re, nltk, pandas, HindiTokenizer's Tokenizer, the xetrapal logger and a
    # tweet_cleaner() helper defined elsewhere in the project
    with open(filename, "r") as f:
        intext = f.read()
    logger.info("Read file " + filename)
    logger.info("Cleaning text")
    cleantext = intext.replace("\nENDOFTWEET\n", "\n")
    cleantext = cleantext.lower()
    cleantext = tweet_cleaner(cleantext)
    cleantext = re.sub(" +", " ", cleantext)
    logger.info("Tokenizing input")
    t = Tokenizer(cleantext)
    t.tokenize()
    grams = nltk.ngrams(t.tokens, gramlength)
    logger.info("Generating freq distribution")
    fdist = nltk.FreqDist(grams)
    freqdist = {}
    for k, v in fdist.items():
        # join each n-gram tuple into a single space-separated key
        freqdist[" ".join(k)] = v
    logger.info("Returning final values")
    freqdistdf = pandas.Series(freqdist).to_frame()
    return freqdistdf
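# Hypothetical call to ngramfrequencyht() above; the file name is an assumption,
# not part of the original snippet.
bigram_df = ngramfrequencyht("tweets.txt", gramlength=2)
print(bigram_df.sort_values(by=0, ascending=False).head(10))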
def tokenize(fileid=None, remove_stopwords=False):
    token_list = []
    for text in raw(fileid):
        t = Tokenizer(text)
        t.tokenize()
        if remove_stopwords:
            t.remove_stop_words()
            token_list.append(t.final_tokens)
        else:
            token_list.append(t.tokens)
    return token_list
def tokenize():
    t = Tokenizer("यह वाक्य हिन्दी में है।")  # "This sentence is in Hindi."
    t.tokenize()
    return t
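# Minimal check of the demo above, assuming tokenize() returns the Tokenizer
# instance and the token list is exposed as t.tokens:
t = tokenize()
print(t.tokens)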
# header_list, regex (a compiled cleanup pattern) and hindi_stopwords are defined
# earlier in the original script; pandas, re, sklearn and HindiTokenizer imports are assumed
Corpus = pd.read_csv("train_hindi.tsv", encoding='utf-8', sep="\t", names=header_list)
# strip URLs from every document
Corpus['text'] = [
    re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', entry)
    for entry in Corpus['text']
]
Corpus['text'] = [regex.sub(' ', entry) for entry in Corpus['text']]
Corpus['text'] = [entry.split() for entry in Corpus['text']]
# stem every non-stopword token with HindiTokenizer
for index, entry in enumerate(Corpus['text']):
    Final_words = []
    for word in entry:
        if word not in hindi_stopwords:
            t = Tokenizer()
            Final_words.append(t.generate_stem_words(word))
    Corpus.loc[index, 'text_final'] = str(Final_words)

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'], Corpus['task_1'], test_size=0.3)
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
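# Possible follow-up (not in the original snippet): score the SVM predictions.
from sklearn.metrics import accuracy_score
print("SVM accuracy:", accuracy_score(Test_Y, predictions_SVM))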
f = open("data/eng-hin-modified.txt", "r+")
s = f.readlines()
f.close()

# sent_tokenize(), delim, unigrammatize() and freq_sorted_unigrams() are defined
# elsewhere in the original project
sentences = []
# tokenize the whole thing into sentences
for line in s[1:2000]:
    t_ = sent_tokenize(line, delim)
    t_ = [x for x in t_ if x != "\n"]
    sentences += t_

# tokenize the whole thing into words
words = []
for sent in sentences:
    tok_ = Tokenizer(sent)
    tok_.tokenize()
    words += tok_.tokens

unigrams = unigrammatize(words)
unigrams = freq_sorted_unigrams(unigrams)

# stopwords = []
for gram in unigrams:
    print(gram[0].decode("utf-8"))
#     if gram[1] > 270:
#         stopwords.append(gram[0])
#     else:
#         break
# for stop in stopwords:
#     print(stop.decode("utf-8"))
from HindiTokenizer import Tokenizer
import sys

if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] + " <corpusfile> <outputfile>\n")
        sys.exit(2)
    file_name = sys.argv[1]
    fopen = open(file_name, "r")
    a = open(sys.argv[2], "w")
    dic_tok = {}
    while True:
        line = fopen.readline()[0:-1]
        if line == '':
            break
        else:
            t = Tokenizer(line)
            t.tokenize()
            for i in t.tokens:  # print_tokens() only prints; the token list itself is t.tokens
                try:
                    dic_tok[i] += 1
                except KeyError:
                    dic_tok[i] = 1
    # keep the 50 most frequent tokens, sorted by descending count
    final_list = []
    for i in dic_tok.items():
        i = list(i)
        i.reverse()
        final_list.append(i)
    final_list.sort()
    final_list.reverse()
    final_list = final_list[0:50]
    for i in final_list:
        # the original snippet is cut off here; presumably each (count, token)
        # pair is written to the output file, e.g.:
        a.write(str(i[0]) + "\t" + i[1] + "\n")
# -*- coding: utf-8 -*-
'''
Tokeniser for Hindi
'''
from HindiTokenizer import Tokenizer
import sys

if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] + " <corpusfile> <outputfile>\n")
        sys.exit(2)
    file_name = sys.argv[1]
    fopen = open(file_name, "r")
    a = open(sys.argv[2], "w")
    while True:
        line = fopen.readline()[0:-1]
        if line == '':
            break
        else:
            t = Tokenizer(line)
            t.generate_sentences()
            for i in t.sentences:  # print_sentences() only prints; the sentence list is t.sentences
                a.write(i + "\n")
    a.close()
    fopen.close()
def removeStopWords(token_list):
    # parameter renamed from 'list' to avoid shadowing the builtin
    f = codecs.open("stopwords.txt", encoding='utf-8')
    stopwords = [x.strip() for x in f.readlines()]
    tokens = [i for i in token_list if i not in stopwords]
    return tokens

texts = []
documents = {}
for i in os.listdir("Reviews"):
    if i.endswith(".txt"):
        with open("Reviews\\" + i) as f:
            documents[i] = []
            for line in f:
                l = line.split('#####')[0]
                t = Tokenizer(l)
                t.generate_sentences()
                for s in t.sentences:
                    if not s.strip() == '':
                        documents[i].append(s)
                t.tokenize()
                tokens = removeStopWords(t.tokens)
                # qwe.extend(tokens)
                texts.append(tokens)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.ldamodel.LdaModel(corpus, num_topics=9, id2word=dictionary, passes=100)
val = model.print_topics(num_topics=8, num_words=10)
print(val)
for value in val:
    a, b = value
from cltk.tokenize.sentence import TokenizeSentence
from HindiTokenizer import Tokenizer
import os
import re
import statistics
import pickle

PATH = '../Data/'
tokenizer = TokenizeSentence('hindi')
files = os.listdir(PATH)
features = []
values = []
for file in files:
    if os.path.isdir(PATH + file + '/'):
        for inner_file in os.listdir(PATH + file + '/'):
            if os.path.isdir(PATH + file + '/' + inner_file + '/'):
                for inner_inner_file in os.listdir(PATH + file + '/' + inner_file + '/'):
                    values.append(file)
                    t = Tokenizer()
                    t.read_from_file(PATH + file + '/' + inner_file + '/' + inner_inner_file)
                    # generate_sentences() fills t.sentences rather than returning them
                    t.generate_sentences()
                    split_shit = t.sentences
                    # split further on '?' and '!' and drop whitespace-only pieces
                    final_split_shit = []
                    for i in split_shit:
                        hello = re.split(r'\?|\!', i)
                        for k in hello:
                            final_split_shit.append(k)
                    filtered_final_split_shit = []
                    for i in final_split_shit:
                        if not bool(re.match(r'^\s+$', i)):
                            filtered_final_split_shit.append(i)
                    words = []
                    for i in filtered_final_split_shit:
                        sentence_tokenized = tokenizer.tokenize(i)