def wordcount(text, logger=xetrapal.astra.baselogger):
    # Tokenize the input text and return a Counter of word frequencies.
    wordFreq = collections.Counter()
    t = Tokenizer(text)
    logger.info("Beginning word count generation on input")
    logger.info("Tokenizing the input")
    t.tokenize()
    parsedText = t.tokens

    # Skip empty tokens and count each remaining word
    for word in tqdm(parsedText):
        if word == '':
            continue
        wordFreq[word] += 1

    return wordFreq
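
# Usage sketch (assumption: the sample Hindi sentence is only illustrative;
# wordcount() is the function defined above and relies on the same imports):
sample_text = "यह वाक्य हिन्दी में है। यह एक और वाक्य है।"
freq = wordcount(sample_text)
print(freq.most_common(5))  # five most frequent tokens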
Example #2
def sent_tokenize(fileid=None):
    # Split every text returned by raw(fileid) into sentences.
    token_list = []
    for text in raw(fileid):
        t = Tokenizer(text)
        t.generate_sentences()
        token_list.append(t.sentences)

    return token_list
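
# Usage sketch (assumption: raw() is the corpus reader used above and
# "sample" is an illustrative file id):
sentences_per_text = sent_tokenize("sample")
print(sentences_per_text[0][:3])  # first three sentences of the first text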
Example #3
def tokenize(fileid=None, remove_stopwords=False):
    # Tokenize every text returned by raw(fileid), optionally dropping stop words.
    token_list = []
    for text in raw(fileid):
        t = Tokenizer(text)
        t.tokenize()
        if remove_stopwords:
            t.remove_stop_words()
            token_list.append(t.final_tokens)
        else:
            token_list.append(t.tokens)

    return token_list
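
# Usage sketch (assumption: "sample" is an illustrative file id;
# remove_stopwords toggles the stop-word filtering path shown above):
tokens_with_stopwords = tokenize("sample")
tokens_without_stopwords = tokenize("sample", remove_stopwords=True)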
Example #4
def ngramfrequencyht(filename, gramlength=3, logger=xetrapal.astra.baselogger):
    # Read a tweet dump, clean it, tokenize it and return an n-gram
    # frequency distribution as a pandas DataFrame.
    with open(filename, "r") as f:
        intext = f.read()
    logger.info("Read file " + filename)
    logger.info("Cleaning text")
    cleantext = intext.replace("\nENDOFTWEET\n", "\n")
    cleantext = cleantext.lower()
    cleantext = tweet_cleaner(cleantext)
    cleantext = re.sub(r" +", " ", cleantext)
    logger.info("Tokenizing input")
    t = Tokenizer(cleantext)
    t.tokenize()
    grams = nltk.ngrams(t.tokens, gramlength)
    logger.info("Generating freq distribution")
    fdist = nltk.FreqDist(grams)
    freqdist = {}
    for k, v in fdist.items():
        freqdist[" ".join(k)] = v
    logger.info("Returning final values")
    freqdistdf = pandas.Series(freqdist).to_frame()
    return freqdistdf
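
# Usage sketch (assumption: "tweets.txt" is an illustrative dump file in the
# ENDOFTWEET-delimited format expected above):
trigram_freq = ngramfrequencyht("tweets.txt", gramlength=3)
print(trigram_freq.sort_values(by=0, ascending=False).head(10))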
Example #5
def tokenize():
    # Tokenize a sample Hindi sentence and return the Tokenizer instance.
    t = Tokenizer("यह वाक्य हिन्दी में है।")
    t.tokenize()
    return t
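
# Usage sketch (assumption: simply printing the resulting token list; the
# exact token boundaries depend on the Tokenizer implementation):
t = tokenize()
print(t.tokens)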
Example #6
def removeStopWords(tokens):
    # Filter out tokens that appear in the stopword list read from stopwords.txt.
    with codecs.open("stopwords.txt", encoding='utf-8') as f:
        stopwords = [x.strip() for x in f.readlines()]
    return [i for i in tokens if i not in stopwords]


texts = []
documents = {}

for i in os.listdir("Reviews"):
    if i.endswith(".txt"):
        with open("Reviews\\" + i) as f:
            documents[i] = []
            for line in f:
                l = line.split('#####')[0]
                t = Tokenizer(l)
                t.generate_sentences()
                for s in t.sentences:
                    if not s.strip() == '':
                        documents[i].append(s)
                t.tokenize()
                tokens = removeStopWords(t.tokens)
                # qwe.extend(tokens)
                texts.append(tokens)
# Build a bag-of-words corpus and fit an LDA topic model over the review tokens.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.ldamodel.LdaModel(corpus,
                                        num_topics=9,
                                        id2word=dictionary,
                                        passes=100)
val = model.print_topics(num_topics=8, num_words=10)
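
# Sketch (assumption, not part of the original script): inspect the topics in
# val, which print_topics() returns as (topic_id, topic_string) pairs.
for topic_id, topic_terms in val:
    print(topic_id, topic_terms)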
Corpus = pd.read_csv("train_hindi.tsv",
                     encoding='utf-8',
                     sep="\t",
                     names=header_list)
Corpus['text'] = [
    re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', entry)
    for entry in Corpus['text']
]
Corpus['text'] = [regex.sub(' ', entry) for entry in Corpus['text']]
Corpus['text'] = [entry.split() for entry in Corpus['text']]

# Stem every non-stopword token with the Hindi stemmer and store the result
# as each row's final text.
for index, entry in enumerate(Corpus['text']):
    Final_words = []
    for word in entry:
        if word not in hindi_stopwords:
            t = Tokenizer()
            Final_words.append(t.generate_stem_words(word))
    Corpus.loc[index, 'text_final'] = str(Final_words)

# Split into train/test sets, encode the labels and build TF-IDF features.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'], Corpus['task_1'], test_size=0.3)
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)  # reuse the fitted label mapping for the test set
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Train a linear SVM on the TF-IDF vectors and predict on the held-out set.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
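
# Sketch (assumption, not part of the original script): score the SVM
# predictions against the encoded test labels with scikit-learn.
from sklearn.metrics import accuracy_score
print("SVM accuracy:", accuracy_score(Test_Y, predictions_SVM))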
    f = open("data/eng-hin-modified.txt", "r+")
    s = f.readlines()
    f.close()

    sentences = []

    # tokenize the whole thing into sentences
    for line in s[1:2000]:
        t_ = sent_tokenize(line, delim)
        t_ = [x for x in t_ if x != "\n"]
        sentences += t_

    # tokenize the whole thing into words
    words = []
    for sent in sentences:
        tok_ = Tokenizer(sent)
        tok_.tokenize()
        words += tok_.tokens

    unigrams = unigrammatize(words)
    unigrams = freq_sorted_unigrams(unigrams)

    # stopwords = []
    for gram in unigrams:
        print(gram[0].decode("utf-8"))
    #     if gram[1] > 270:
    #         stopwords.append(gram[0])
    #     else:
    #         break
    # for stop in stopwords:
    #     print(stop.decode("utf-8"))
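
# Sketch (assumption, reconstructing the commented-out logic above): collect
# high-frequency unigrams as candidate stopwords, assuming `unigrams` is a list
# of (token, count) pairs sorted by descending count and 270 is the chosen
# frequency threshold.
def extract_stopwords(unigrams, threshold=270):
    # Keep taking tokens while their count exceeds the threshold.
    stopwords = []
    for token, count in unigrams:
        if count > threshold:
            stopwords.append(token)
        else:
            break
    return stopwords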