def hi_stem1(tokens):
    # Stem each whitespace-separated token; if the whole input string is a
    # stop word, stem it as a single token instead.
    if tokens not in stop_words:
        return [hi_stem(token) for token in tokens.split(" ")]
    else:
        return [hi_stem(tokens)]
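
A quick usage sketch; hi_stem and stop_words are assumed to come from the surrounding module, and the Hindi strings are only illustrative:

# Stems each word of a sentence; results depend on the module's hi_stem rules.
stems = hi_stem1(u"लड़कों ने आम खाये")
# If the whole input is in stop_words, it is stemmed as one token.
single = hi_stem1(u"और")  # assuming u"और" appears in stop_words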
Example #2
def tokenizer_hindi(document, sentence):
    stopWords = getHindiStopWords()
    text = cfg.clean_text(sentence)
    if len(text) < 1:
        return []
    tokens = Text(text)
    # Stem every token, then drop stop words.
    tokens = [hi_stem(tkn) for tkn in tokens.words]
    tokens = [t for t in tokens if t not in stopWords]
    # Index the sentence under each stem it contains.
    for token in tokens:
        document.stemSentenceMap.setdefault(token, []).append(sentence)
    return tokens
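
A minimal sketch of how this might be driven; the Document stand-in and the sentence are hypothetical, and getHindiStopWords, cfg.clean_text, Text, and hi_stem are assumed from the surrounding project:

class Document:
    # Minimal stand-in exposing the attribute tokenizer_hindi expects.
    def __init__(self):
        self.stemSentenceMap = {}

doc = Document()
tokenizer_hindi(doc, u"राम ने सेब खाया")  # illustrative Hindi sentence
# doc.stemSentenceMap now maps each stem to the sentences containing it.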
Example #3
import codecs
import string

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# hi_stem() is assumed to be provided elsewhere in the module
# (the shared Hindi stemmer these examples use).
lemmatizer = WordNetLemmatizer()

def fileringSentences(fName, lemm=True, stemm=True, stop=True, lan="english"):
    stemmer = None
    if stemm and lan in ["spanish", "english"]:
        stemmer = SnowballStemmer(lan)
    stop_words = set()  # stays empty unless a stop list exists for lan
    with codecs.open(fName, 'r', encoding="utf-8") as f:
        # Build the output file name from the chosen preprocessing steps.
        f_start = fName.replace(".conf", "")
        ending = ""
        if stemm:
            ending += "_stemmed"
        if lemm:
            ending += "_lemmed"
        if stop:
            ending += "_stop"
            if lan in ["spanish", "english"]:
                stop_words = set(stopwords.words(lan))
        if not stemm and not lemm:
            ending += "_raw"
        ending += ".conf"
        with codecs.open(f_start + ending, "w", encoding="utf-8") as write_file:
            for line in f:
                l = line.split(",")
                # Drop the leading label column, then normalise the text.
                l3 = line.replace(l[0] + ",", '').replace("-", " ")
                #l3 = re.sub('[^A-Za-z0-9\ ]+', '', l3)
                for rem in ["\n", "\t", "\r", ".", "?", "!", u'¿', u'¡', u'।', u"\u0964", "|"]:
                    l3 = l3.replace(rem, "")
                for pun in string.punctuation:
                    l3 = l3.replace(pun, "")
                l3 = l3.lower()
                while "  " in l3:
                    l3 = l3.replace("  ", " ")

                # Skip empty lines instead of silently reusing the previous
                # sentence (the original only assigned filtered_sentence
                # inside this check).
                if line == "" or l3 == "":
                    continue
                filtered_sentence = l3.split(" ")
                if stop:
                    filtered_sentence = [w for w in filtered_sentence if w not in stop_words]
                if stemm:
                    if lan == "hindi":
                        filtered_sentence = [hi_stem(w) for w in filtered_sentence]
                    elif stemmer is not None:
                        filtered_sentence = [stemmer.stem(w) for w in filtered_sentence]
                if lemm:
                    filtered_sentence = [lemmatizer.lemmatize(w) for w in filtered_sentence]
                if len(filtered_sentence) > 0:
                    out_string = l[0] + "," + " ".join(filtered_sentence)
                    write_file.write(out_string + "\n")
    print(lan, lemm, stemm, stop)
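
A hypothetical call; the input file name is made up. The suffix of the output file encodes the steps applied, so this writes sentences_stemmed_stop.conf next to the input:

fileringSentences("sentences.conf", lemm=False, stemm=True, stop=True,
                  lan="english")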
Example #4
import sqlite3

def search(word):
    # check_for_db() and hindi_stemmer come from the surrounding module.
    check_for_db()
    conn = sqlite3.connect("words.db")
    cursor = conn.cursor()
    # Use parameterised queries instead of string concatenation to avoid
    # SQL injection and quoting bugs.
    cursor.execute("SELECT * FROM words WHERE word = ?", (word,))
    x = cursor.fetchall()
    if len(x) != 0:
        print("word found in the database:", x)
    else:
        x = hindi_stemmer.hi_stem(word)
        cursor.execute("INSERT INTO words (word, stemmed) VALUES (?, ?)",
                       (word, x))
        conn.commit()
        print("word not found in the database:", x,
              "inserted into the database")
    conn.close()
Example #5

def Remove_stem(tweet):
    # Stem every whitespace-separated token and rebuild the tweet.
    tweet_list = [hs.hi_stem(word) for word in tweet.split()]
    return " ".join(tweet_list)
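
A quick usage sketch; hs is assumed to be the imported Hindi stemmer module, and the tweet text is a made-up example:

stemmed_tweet = Remove_stem(u"लड़कियों ने किताबें पढ़ीं")
print(stemmed_tweet)  # each word replaced by its stem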
Example #6
def tokenizer_hindi(document):
    # Clean the raw document, tokenize it, and stem each token.
    tokens = Text(cfg.clean_text(document))
    tokens = [hi_stem(tkn) for tkn in tokens.words]
    return tokens
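
Hypothetical usage, assuming Text is polyglot.text.Text (its .words attribute matches that API) and cfg.clean_text is a project helper that strips markup and noise from the raw string:

stems = tokenizer_hindi(u"लड़के स्कूल जाते हैं")  # illustrative input
print(stems)  # list of stems, one per token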