def how_many_questions(q,sentences,stemmed_sentences,sentence_vec): search_string = "how many {!VB*+} VB*" m = search(search_string, q) print m first_part =[w.string for w in m[0].words[2:-1]] search_string = "VB* *+" main_part = [lemma(first_part[0])] m = search(search_string, q) second_part = [w.string for w in m[0].words[1:]] answer = first_part + second_part stem_answer = " ".join([utils.stemm_term(w).lower() for w in answer]) stem_vector = pp.text_to_vector (stem_answer) possible_answers,ans_idx = get_possible_answers(sentences,stem_vector, stemmed_sentences,sentence_vec,main_part) answered = False index = 0 steps_to_rewind = 5 while not answered and index < len(possible_answers): current_sentence = sentences[ans_idx[index]].split() curr = possible_answers[index].split() main_idx = curr.index(main_part[0]) if main_idx == -1: index+=1 continue num_idx = -1 for i in range(steps_to_rewind): if main_idx - (i+1) >= 0: if english_pack.is_number(curr[main_idx- (i+1)]): num_idx = main_idx - (i+1) break else: break if num_idx != -1: end_idx = num_idx+1 while num_idx > 0 and english_pack.is_number(curr[num_idx-1]): num_idx-=1 number = " ".join(current_sentence[num_idx:end_idx]) if number != "," and number != ".": print fix_punctuation(" ".join([w.string for w in q.words])) print fix_punctuation(number + " " + " ".join(first_part) + ".") answered = True break index+=1 return answered
def checkTypeWordCount(answer,question): count = 0 status = '' sum = 0 status1 = 'false' for word1 in word_tokenize(answer): if word1 == '.' or word1 == ',' or word1 == '\'' or word1 == '\"' or word1 == ':' or word1 == ';' or word1 == '?' or word1 == '/' or word1 == '\\' or word1 == '|' or word1 == ']' or word1 == '[' or word1 == '}' or word1 == '{' or word1 == '(' or word1 == ')' or word1 == '*' or word1 == '&' or word1 == '^' or word1 == '%' or word1 == '$' or word1 == '#' or word1 == '@' or word1 == '!' or word1 == '`' or word1 == '~' or word1 == '-' or word1 == '_' or word1 == '='or word1 == '+': print 'error' else: sum = sum +1 #print word1 print sum words_ans = word_tokenize(answer) words_qus = word_tokenize(question) if words_ans[0]=="NOTICE"or words_ans[0]=="Notice": print "Correct" count = count+0.25 else: status = "Wrong" for word in words_qus: if en.is_number(word) and words_qus[words_qus.index(word)+1]== 'words': if sum >= word: print word count = count+0.25 status1='true' if status1 == 'false': count = count+0.25 return count,status
def find_grammatical_kind(self):
    """Label every space-separated word of the sentence with its kind.

    Commas are stripped first. Each word gets the first matching label from
    an ordered list of `en` predicates, or "unclear" when none match.
    Returns the list of labels, parallel to the words.
    """
    sentence = re.sub(",", "", self.get_sentence())  # delete all commas
    # Ordered (predicate, label) pairs -- first match wins.
    checks = [
        (en.noun.is_emotion, "emotion"),
        (en.is_connective, "connective"),
        (en.is_verb, "verb"),
        (en.is_adjective, "adjective"),
        (en.is_noun, "noun"),
        (en.is_persuasive, "persuasive"),
        (en.is_number, "number"),
    ]
    kinds = []
    for token in sentence.split(" "):
        for predicate, label in checks:
            if predicate(token):
                kinds.append(label)
                break
        else:
            kinds.append("unclear")
    return kinds
def when_questions(curr, curr_idx, current_sentence, full_sentence, answer):
    """Extend a partial answer for a "when ..." question with a date phrase.

    curr / current_sentence -- parallel token lists for the candidate
        sentence (presumably processed vs. original casing -- TODO confirm)
    curr_idx -- index of the current token in both lists
    full_sentence -- raw sentence text, re-tagged here with parsetree()
    answer -- the answer built so far (space-separated string)
    Returns (True, answer_string) on success, (False, "") otherwise.
    """
    already_seen = set(answer.split())
    initial_size = len(answer.split())  # length before we append anything
    # NOTE(review): have_seen() semantics are not visible here -- presumably
    # it checks whether the token duplicates something already in the answer.
    if have_seen(already_seen, current_sentence, curr_idx):
        answer += " " + current_sentence[curr_idx]
        curr_idx += 1
    tagged_curr = parsetree(full_sentence).words
    date_preposition = set(["in", "at", "on"])
    # look for the NP after the prep.
    # Advance until we hit a date token or end-of-sentence punctuation,
    # copying unseen tokens into the answer along the way.
    while not (is_date(tagged_curr[curr_idx].string) or tagged_curr[curr_idx].type == "."):
        if not have_seen(already_seen, current_sentence, curr_idx):
            answer += " " + current_sentence[curr_idx]
        curr_idx += 1
    # Consume the date phrase itself: date tokens, determiners, and commas.
    while curr_idx < len(curr) and (is_date(tagged_curr[curr_idx].string)
            or tagged_curr[curr_idx].type == "DT" or tagged_curr[curr_idx].type == ","):
        if not have_seen(already_seen, current_sentence, curr_idx):
            answer += " " + current_sentence[curr_idx]
        curr_idx += 1
    answer = answer.split()  # work on the token list from here on
    curr_idx -= 1
    # Trim trailing non-date tokens (e.g. a dangling determiner or comma),
    # but never eat into the original answer prefix.
    while not is_date(tagged_curr[curr_idx].string) and len(answer) > initial_size:
        del answer[-1]
        curr_idx -= 1
    if len(answer) == initial_size:
        return False, ""  # nothing date-like was added
    answer[0] = answer[0].title()
    found_date = False
    found_prep = False
    for w in answer:
        if is_date(w):
            found_date = True
        if w in date_preposition:
            found_prep = True
    if not found_date:
        return False, ""
    # No preposition present: insert "on" before a day-of-month number
    # (< 32), otherwise "in" (months/years).
    if not found_prep:
        first_part = answer[:initial_size]
        last_part = answer[initial_size:]
        if english_pack.is_number(last_part[0]) and int(last_part[0]) < 32:
            answer = first_part + ["on"] + last_part
        else:
            answer = first_part + ["in"] + last_part
    answer = " ".join(answer)
    if not "." in answer:
        answer += "."
    return True, answer
def isNumber(text):
    """Return True when *text* is made up entirely of number words,
    optionally glued together with 'a' and 'and' (e.g. "a hundred and two").
    Empty input yields False.
    """
    tokens = nltk.word_tokenize(text.lower())
    if not tokens:
        return False
    return all(token in ['a', 'and'] or en.is_number(token) for token in tokens)
def exp2words(expstr):
    """Convert an arithmetic expression string into spoken English.

    e.g. "2 + 2" -> "two plus two". Numbers are spoken via
    en.number.spoken(); operators are mapped through a fixed table; a pair
    of adjacent '*' tokens is merged into '**' ("to the power of").
    Unknown tokens are silently dropped.
    """
    tokens = nltk.word_tokenize(expstr.lower())
    operators = {'+': 'plus', '-': 'minus', '/': 'divided by',
                 '*': 'multiplied by', '=': 'is equal to',
                 '**': 'to the power of', 'log': 'the logarithm of'}
    # BUG FIX: the original fell through to the epilogue with lw = None (and
    # `w` unbound) on empty input, crashing; return early instead.
    if not tokens:
        return ''
    text = ''
    lw = None  # previous token; emitted once its successor is known
    for w in tokens:
        if lw is not None:
            if en.is_number(lw):
                text += en.number.spoken(lw) + ' '
            elif lw == '*' and w == '*':
                # The tokenizer splits '**' into two '*'; merge them so the
                # next flush emits "to the power of".
                w = '**'
            elif lw in operators:
                text += operators[lw] + ' '
        lw = w
    # Flush the final token (lw holds the last w here). The original's dead
    # `elif lw=='*' and w=='*': w='**'` epilogue branch had no effect and
    # was removed.
    if en.is_number(lw):
        text += en.number.spoken(lw) + ' '
    elif lw in operators:
        text += operators[lw] + ' '
    return text.strip()
def word_count(answer,question): words_qus = word_tokenize(question) count = 0 status1 = 'false' for word in words_qus: if en.is_number(word) and words_qus[words_qus.index(word)+1]== 'words': if sum_of_words(answer) >= word: print word count = count+0.5 status1 = 'true' if status1 == 'false': count = count+0.5 return count
def words2exp(text):
    """Return True when *text* already simplifies to a numeric expression
    under sympy (i.e. it needs no words-to-expression conversion).

    BUG FIX: the original computed isExp but never returned it, so the
    function always returned None -- the whole computation was dead.
    Commented-out legacy tokenizer code was removed.
    """
    try:
        isExp = en.is_number(str(sympy.simplify(text).evalf()))
    except Exception:
        # sympy raises a variety of errors on non-expressions; treat any
        # failure as "not an expression" (deliberate best-effort).
        isExp = False
    return isExp
import en

# Smoke tests for the `en` (NodeBox English Linguistics) module:
# spelling correction, spelling suggestions, and number-word detection.
print en.spelling.correct('zer0')
print en.spelling.suggest('tw0')
print en.is_number('eight')
import string
print string.punctuation

# NOTE(review): the block below is a bare string literal with no runtime
# effect -- it documents the spark-submit invocation used to run this job.
"""
/usr/lib/spark/bin/spark-submit \
--master yarn-client \
--conf "spark.yarn.executor.memoryOverhead=8192" \
--conf "spark.shuffle.memoryFraction=0.5" \
--executor-memory 10g --executor-cores 2 --num-executors 5 \
--py-files wedc-lib.zip,scipy-0.18.0.dev0_5a779fd-py2.7-linux-x86_64.egg,scikit_learn-0.18.dev0-py2.7-linux-x86_64.egg,Cython-0.24-py2.7-linux-x86_64.egg,nltk-3.2.1-py2.7.egg,nose2-0.6.2-py2.7.egg,pyenchant-1.6.7-py2.7.egg,digSparkUtil-1.0.23-py2.7.egg,inflection-0.3.1-py2.7.egg,numpy-1.11.0-py2.7-linux-x86_64.egg \
spark_entrance.py \
$@
"""
#! /usr/bin/python2 # -*- coding: utf-8 -*- import en # This file just runs some tests to see if en is working. # To run it, cd to the directory just above en, then # python2 < _en-test.py # LEXICAL CATEGORIZATION ############################################################ # Returns True when the given value is a number. print(1, en.is_number(12)) print(2, en.is_number("twelve")) # Returns True when the given string is a noun. # You can also check for is_verb(), is_adjective() and is_adverb(). print(3, en.is_noun("banana")) # Returns True when the given string is a tag, # for example HTML or XML. print(4, en.is_tag("</a>")) # Return True when the string is a HTML tag, # for example <a> or <body>. print(5, en.is_html_tag("</person>")) # COMMONSENSE ####################################################################### # Returns True if the given word expresses a basic emotion: # anger, disgust, fear, joy, sadness, surprise.
x = int(x) except: pass if lazy == 0: temp += en.number.spoken(x) lazy = 1 else: temp += ' ' + en.number.spoken(x) text = temp convert(text) else: print text else: print text if text == '***': started = 1 text = raw_input() print en.is_number("twelve") print en.is_basic_emotion("cheerful")