Code Example #1
File: app.py Project: RanaPriyam/SMS_spam_detector
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer


def clean_text(message, stops):
    # Tokenize, drop stopwords, then lemmatize each remaining token
    # using its POS tag; get_simple_pos (sketched below) maps Penn
    # Treebank tags to WordNet POS constants.
    words = word_tokenize(message)
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = WordNetLemmatizer().lemmatize(
                w, pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return " ".join(output_words)
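The helper get_simple_pos is not shown in this excerpt. A minimal sketch of a typical implementation, assuming it maps Penn Treebank tags to the WordNet POS constants that lemmatize() expects:

from nltk.corpus import wordnet


def get_simple_pos(tag):
    # Map a Penn Treebank tag to a WordNet POS constant.
    if tag.startswith('J'):
        return wordnet.ADJ   # adjective
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV   # adverb
    else:
        return wordnet.NOUN  # default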
Code Example #2
import re

import enchant
import nltk
from nltk.stem import WordNetLemmatizer


def get_bow1(file):
    # Build a bag of lemmatized noun/verb tokens from a text file.
    with open(file, 'rb') as f:
        content = f.read().splitlines()
    d = enchant.Dict("en_US")  # spell checker (the check below is disabled)
    bowtemp = []
    noun_verb_tags = ('NN', 'NNP', 'NNS', 'NNPS',
                      'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    for sentence in content:
        words = nltk.word_tokenize(str(sentence))
        for word, pos in nltk.pos_tag(words):
            if len(word) > 2 and pos in noun_verb_tags:
                # Strip stray encoding artifacts before lemmatizing.
                word = word.replace('0x96', "")
                word = word.replace('.', "")
                word = word.replace('0x94', "")
                word = WordNetLemmatizer().lemmatize(word.lower(), pos='v')
                # if d.check(word):
                if not re.search(r'^[A-Z]', word):
                    bowtemp.append(word)
    return bowtemp
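A hypothetical call (the filename is a placeholder):

bow = get_bow1('corpus.txt')  # hypothetical input file
print(bow[:10])               # first ten lemmatized noun/verb tokens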
Code Example #3
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def general_processing_file(list_text):
    # Strip punctuation and digits, tokenize, drop short words and
    # stopwords, then lemmatize and lowercase the remaining tokens.
    stop_to_clean = set(stopwords.words('english'))
    text_contain = []
    for text in list_text:
        if isinstance(text, bytes):  # the original was Python 2 code
            text = text.decode('utf-8')
        for punctuate in string.punctuation:
            text = text.replace(punctuate, ' ')
        for digit in string.digits:
            text = text.replace(digit, '')
        single_text = []
        for word_c in nltk.word_tokenize(text):
            # Compare lowercase so capitalized stopwords are caught too.
            if len(word_c) >= 3 and word_c.lower() not in stop_to_clean:
                word_c = WordNetLemmatizer().lemmatize(word_c)
                single_text.append(word_c.lower())
        text_contain.append(' '.join(single_text))
    return text_contain
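A hypothetical call on two short documents:

docs = ["The 3 cats were running!", "Dogs barked loudly in 1999."]
print(general_processing_file(docs))  # cleaned, lemmatized strings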
Code Example #4
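Example #4 relies on imports the excerpt omits. A plausible preamble, assuming that algo is tkinter imported under an alias and that rake is the RAKE keyword-extraction module the project bundles (both inferred from the calls below, not confirmed by the excerpt):

import io
import random
import re
import tkinter as algo   # assumption: "algo" aliases tkinter

import nltk
import rake              # assumption: bundled RAKE implementation
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize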
def objective_generator(mystring):

    # front end
    Height = 600
    Width = 1200
    count = 0

    resultwindow = algo.Tk()
    resultwindow.state('zoomed')

    canvas = algo.Canvas(resultwindow, bg='green', width=Width)

    scroll_y = algo.Scrollbar(resultwindow,
                              orient="vertical",
                              command=canvas.yview)

    resultwindow.title("Objective Questions")
    frame1 = algo.Frame(canvas, bg='blue')
    frame1.place(relwidth=1, relheight=1)

    canvas.create_window(100, 100, anchor='nw', window=frame1, width=1350)

    T = algo.Text(frame1, font='Georgia', wrap=algo.WORD, padx=20, height=3)
    T.pack(anchor='n')
    T.insert(algo.END, mystring)

    print(mystring)

    stoppath = "data/stoplists/ourstopword.txt"
    # RAKE extractor; the numeric arguments follow the RAKE-tutorial
    # signature: min characters, max words per phrase, min frequency.
    rake_object = rake.Rake(stoppath, 5, 3, 4)
    sample_file = io.open("mcq.txt", 'r', encoding="iso-8859-1")
    text = sample_file.read()
    keywords = rake_object.run(text)
    # print(keywords)

    # Read the custom stopword file into a list of words.
    with open('data/stoplists/ourstopword.txt', 'r') as file:
        stopword = file.read().replace('\n', ' ')
        stopwords = stopword.split()

    with open('mcq.txt', 'r') as file:
        string1 = file.read().replace('\n', '')
    # string1= open("mcq.txt",'r')
    splitstring = string1.split()
    list1 = []
    i = 0
    for word in splitstring:
        if word == 'is':
            x = i - 1
            # if splitstring[x-1] not in stopwords:
            list1.append(splitstring[x])
        i = i + 1
    # print(list1)

    # mystring =  '''Romeo and Juliet was written by William shakespeare. His name is Sandesh Sukubhattu.
    # lines of code is called program.  That program is named as that statue. the program with two lines of codes is named as binary program.
    # prime machine was developed in 1990s.'''

    # for question and .... (keyword)
    defword = [
        'is called', 'was called', 'named as', 'developed in', 'launched in',
        'developed by', 'invented in', 'invented by', 'presented in',
        'presented by', 'viewed in', 'viewed by', 'called by', 'named by',
        'written by', 'read by'
    ]

    # for ....and question(keyword)
    beforedefword = [' is ', 'was', 'will']

    sentence = sent_tokenize(mystring)
    for onesentence in sentence:
        for onedefword in defword:
            before_keyword, keyword, after_keyword = onesentence.partition(
                onedefword)
            # print(before_keyword, after_keyword)
            after = after_keyword.split()
            before = before_keyword.split()

            t = []
            option = []

            for afterword in after:
                tokenwordcount = 0
                for beforeword in before:

                    text = word_tokenize(beforeword)
                    POS_tag = nltk.pos_tag(text)

                    wordnet_lemmatizer = WordNetLemmatizer()
                    adjective_tags = ['JJ', 'JJR', 'JJS']
                    lemmatized_text = []

                    for word in POS_tag:
                        if word[1] in adjective_tags:
                            lemmatized_text.append(
                                str(
                                    wordnet_lemmatizer.lemmatize(word[0],
                                                                 pos="a")))
                        else:
                            lemmatized_text.append(
                                str(wordnet_lemmatizer.lemmatize(
                                    word[0])))  #default POS = noun

                    # print ("Text tokens after lemmatization of adjectives and nouns: \n")
                    # print (lemmatized_text)

                    for word in lemmatized_text:
                        beforeword_lem = WordNetLemmatizer().lemmatize(
                            word, 'v')
                    # print(beforeword_lem)

                    if beforeword_lem.lower() not in stopwords:
                        tokenwordcount = tokenwordcount + 1
                if tokenwordcount > 1:

                    # lemmatization start
                    text = word_tokenize(afterword)
                    POS_tag = nltk.pos_tag(text)

                    wordnet_lemmatizer = WordNetLemmatizer()
                    adjective_tags = ['JJ', 'JJR', 'JJS']
                    lemmatized_text = []

                    for word in POS_tag:
                        if word[1] in adjective_tags:
                            lemmatized_text.append(
                                str(
                                    wordnet_lemmatizer.lemmatize(word[0],
                                                                 pos="a")))
                        else:
                            lemmatized_text.append(
                                str(wordnet_lemmatizer.lemmatize(
                                    word[0])))  #default POS = noun

                    for word in lemmatized_text:
                        afterword_lem = WordNetLemmatizer().lemmatize(
                            word, 'v')
                    # lemmatization end

                    if afterword_lem.lower() in stopwords:
                        break
                    else:
                        x = re.search("[,.!]", afterword)
                        if x:
                            if re.search("[,.]", afterword):
                                last = re.sub("[,.]", "", afterword,
                                              count=1)
                                if last not in stopwords:
                                    t.append(last)
                                    print("keyword:", ' '.join(t))

                                    print(before_keyword, onedefword,
                                          "........")
                                    print("option a:", ' '.join(t))
                                    print("option b:", "something")

                                    option.append(' '.join(t))
                                    option.append('something1')
                                    option.append('something2')
                                    option.append('something3')

                                    random.shuffle(option)

                                    T = algo.Text(frame1,
                                                  font='Georgia',
                                                  wrap=algo.WORD,
                                                  padx=20,
                                                  height=3)
                                    T.pack(anchor='n')
                                    T.insert(algo.END, '\n')
                                    T.insert(algo.END, before_keyword)
                                    T.insert(algo.END, onedefword)
                                    T.insert(algo.END, '......')

                                    x = 1
                                    for options in option:

                                        radiobutton = algo.Radiobutton(
                                            frame1,
                                            text=options,
                                            bg='white',
                                            borderwidth=3,
                                            activeforeground='blue',
                                            variable=count,
                                            value=x,
                                            selectcolor='white',
                                            tristatevalue=0)
                                        radiobutton.pack(anchor='w',
                                                         side='top',
                                                         ipadx=300)
                                        x = x + 1

                                    count = count + 1

                                    T.insert(algo.END, ' \n')

                            break
                        else:
                            t.append(afterword)

    # print("\n")

        for onedefword in beforedefword:
            before_keyword, keyword, after_keyword = onesentence.partition(
                onedefword)
            # print(before_keyword, after_keyword)
            after = after_keyword.split()
            before = before_keyword.split()
            t = []
            numofstopinbefore = 0
            tokenwordcount = 0
            option = []
            for afterword in after:
                # lemmatization start
                text = word_tokenize(afterword)
                POS_tag = nltk.pos_tag(text)

                wordnet_lemmatizer = WordNetLemmatizer()
                adjective_tags = ['JJ', 'JJR', 'JJS']
                lemmatized_text = []

                for word in POS_tag:
                    if word[1] in adjective_tags:
                        lemmatized_text.append(
                            str(wordnet_lemmatizer.lemmatize(word[0],
                                                             pos="a")))
                    else:
                        lemmatized_text.append(
                            str(wordnet_lemmatizer.lemmatize(
                                word[0])))  #default POS = noun

                for word in lemmatized_text:
                    afterword_lem = WordNetLemmatizer().lemmatize(word, 'v')
                # lemmatization end
                if afterword_lem.lower() not in stopwords:
                    tokenwordcount = tokenwordcount + 1
            if tokenwordcount > 1:
                for beforeword in before:
                    # lemmatization start
                    text = word_tokenize(beforeword)
                    POS_tag = nltk.pos_tag(text)

                    wordnet_lemmatizer = WordNetLemmatizer()
                    adjective_tags = ['JJ', 'JJR', 'JJS']
                    lemmatized_text = []

                    for word in POS_tag:
                        if word[1] in adjective_tags:
                            lemmatized_text.append(
                                str(
                                    wordnet_lemmatizer.lemmatize(word[0],
                                                                 pos="a")))
                        else:
                            lemmatized_text.append(
                                str(wordnet_lemmatizer.lemmatize(
                                    word[0])))  #default POS = noun

                    for word in lemmatized_text:
                        beforeword_lem = WordNetLemmatizer().lemmatize(
                            word, 'v')
                    # lemmatization end
                    if beforeword_lem.lower() in stopwords:
                        numofstopinbefore = numofstopinbefore + 1
                    if numofstopinbefore == len(before):
                        break
                    else:
                        t.append(beforeword)
                if numofstopinbefore != len(before) and len(t) < 4:
                    print("........", keyword, after_keyword)
                    print("option a:", ' '.join(t))
                    print("option b:", "something")

                    r = ' '.join(t)

                    option.append(' '.join(t))
                    option.append('something1')
                    option.append('something2')
                    option.append('something3')

                    random.shuffle(option)

                    T = algo.Text(frame1,
                                  font='Georgia',
                                  wrap=algo.WORD,
                                  padx=20,
                                  height=3)
                    T.pack(anchor='n')
                    T.insert(algo.END, '\n')
                    T.insert(algo.END, '......')
                    T.insert(algo.END, keyword)
                    T.insert(algo.END, after_keyword)

                    x = 1
                    for options in option:
                        radiobutton = algo.Radiobutton(frame1,
                                                       text=options,
                                                       bg='white',
                                                       borderwidth=3,
                                                       activeforeground='blue',
                                                       variable=count,
                                                       value=x,
                                                       selectcolor='white',
                                                       tristatevalue=0)
                        radiobutton.pack(anchor='w', side='top', ipadx=300)
                        x = x + 1

                    count = count + 1

                    T.insert(algo.END, ' \n')

    canvas.update_idletasks()

    canvas.configure(scrollregion=canvas.bbox('all'),
                     yscrollcommand=scroll_y.set)

    canvas.pack(fill='both', expand=True, side='left')
    scroll_y.pack(fill='y', side='right')
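A hypothetical driver for Example #4, assuming mcq.txt and the stoplist file exist alongside the script (the passage is adapted from the sample text commented out in the source):

passage = ("Romeo and Juliet was written by William Shakespeare. "
           "A program with two lines of code is called a binary program.")
objective_generator(passage)  # builds the question window
algo.mainloop()               # run the Tk event loop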