def extract_numbers(sent):
    """Extract number mentions from a tokenized sentence.

    :param sent: list of string tokens
    :return: list of ``(start, end, value)`` tuples where ``sent[start:end]``
             spells the integer ``value`` (``end`` exclusive)
    """
    sent_nums = []
    i = 0
    while i < len(sent):
        toke = sent[i]
        a_number = False
        try:
            itoke = int(toke)
            a_number = True
        except ValueError:
            pass
        if a_number:
            # token is already a digit string
            sent_nums.append((i, i + 1, int(toke)))
            i += 1
        # NOTE(review): sibling variants of this function in the codebase use
        # `not annoying_number_word(...)` here; the polarity is preserved as
        # found — confirm against the local annoying_number_word semantics.
        elif toke in number_words and annoying_number_word(sent, i):
            # get longest span (this is kind of stupid)
            j = 1
            # BUG FIX: bound must be `i + j < len(sent)` — the original `<=`
            # let sent[i + j] index one past the end, raising IndexError when
            # a run of number words reached the last token.
            while (i + j < len(sent) and sent[i + j] in number_words
                   and annoying_number_word(sent, i + j)):
                j += 1
            try:
                sent_nums.append((i, i + j, text2num(" ".join(sent[i:i + j]))))
            except NumberException:
                # fall back to converting just the first token of the span
                sent_nums.append((i, i + 1, text2num(sent[i])))
            i += j
        else:
            i += 1
    return sent_nums
def create_questions(self, sentence, chunked):
    """Build gap-fill question candidates from a chunked sentence.

    Collects NUMBER / LOCATION / PROPER chunks as (label, original, modified)
    gap tuples and, when enough distinct gaps exist, registers a
    QuestionSentence on self.quiz. Returns None (aborts early if a NUMBER
    phrase looks like a range).
    """
    gaps = []
    for word in chunked:
        # plain (token, tag) tuples are leaves; only subtrees carry labels
        if type(word) != tuple:
            target = []
            for y in word:
                target.append(y[0])
            orig_phrase = " ".join(target)
            if word.label() == "NUMBER":
                modified_phrase = orig_phrase[:]
                try:
                    # convert spelled out word to numerical value
                    # BUG FIX: original referenced the undefined name
                    # `phrase`, which always raised NameError and silently
                    # skipped the conversion via the bare except below.
                    modified_phrase = t2n.text2num(orig_phrase)
                except:
                    try:
                        test = int(modified_phrase) + float(modified_phrase)
                    except:
                        # if the word could not be converted and
                        # was not already numerical, ignore it
                        continue
                if self.probably_range(modified_phrase):
                    # ranges ("three to five") make bad gaps; abandon sentence
                    return
                gaps.append((word.label(), orig_phrase, modified_phrase))
            elif word.label() in ["LOCATION", "PROPER"]:
                gaps.append((word.label(), orig_phrase, orig_phrase))
    # require at least two gaps, all distinct
    if len(gaps) >= 2 and len(gaps) == len(set(gaps)):
        gaps_filtered = [gap for gap in gaps
                         if gap[0] == 'NUMBER' or gap[0] == 'LOCATION']
        if len(gaps_filtered) and len(gaps) - len(gaps_filtered) > 2:
            self.quiz.add(QuestionSentence(sentence, gaps_filtered))
def get_freq_sequences(self, data_dir):
    """Cache the 75 most frequent number-free trigrams of train.target.

    Reads ``<data_dir>/train.target``, counts every 3-word sequence that
    contains no numeric token (neither digits nor number words), tokenizes
    the top 75, and stores them in ``self.freq_seq`` keyed by the first two
    sub-token ids.

    :param data_dir: directory containing a ``train.target`` text file
    """
    big_map = defaultdict(int)
    with open(os.path.join(data_dir, "train.target"), 'r', encoding='utf-8') as f:
        for paragraph in f.readlines():
            words = paragraph.split(' ')
            for i in range(0, len(words) - 2):
                li = words[i:i + 3]  # sliding trigram
                has_num = False
                for tok in li:
                    num = ''
                    try:
                        num = int(tok)
                    except:
                        try:
                            # spelled-out number, e.g. "twelve"
                            num = text2num(tok)
                        except:
                            pass
                    if isinstance(num, int):
                        has_num = True
                if not has_num:
                    current_seq = ' '.join(li)
                    big_map[current_seq] += 1
    # tokenize the 75 most frequent number-free trigrams
    tokens = self.tokenizer.batch_encode_plus([
        k for k, v in sorted(
            big_map.items(), key=lambda item: item[1], reverse=True)
    ][:75], return_tensors='pt')
    # NOTE(review): keys are sub-token ids at positions 1:3 and the value is
    # position 4 — presumably skipping special tokens; confirm this matches
    # the tokenizer's layout for 3-word inputs.
    self.freq_seq = {
        tuple(x[1:3].tolist()): x[4]
        for x in tokens['input_ids']
    }
    print(self.freq_seq)
def extract_numbers(sent) -> List[NumberSpan]:
    """Extract number mentions from a tokenized sentence as NumberSpans.

    :param sent: list of string tokens
    :return: list of NumberSpan(start, end, value), end exclusive
    """
    sent_nums = []
    i = 0
    while i < len(sent):
        toke = sent[i]
        if toke.isnumeric():
            sent_nums.append(NumberSpan(i, i + 1, int(toke)))
            i += 1
        elif toke in Keywords.number and not annoying_number_word(
                sent, i):
            # get longest span (this is kind of stupid)
            j = 1
            while (i + j < len(sent) and sent[i + j] in Keywords.number
                   and not annoying_number_word(sent, i + j)):
                j += 1
            # corner cases: "x three - pointers", "eight nine turnovers"
            # BUG FIX: guard the lookahead — sent[i + 2] raised IndexError
            # when a two-word number run ended exactly at the sentence end.
            if j > 1 and i + 2 < len(sent) and sent[i + 2] in ["-", "'s", "turnovers"]:
                j = 1
            try:
                sent_nums.append(
                    NumberSpan(i, i + j, text2num(" ".join(sent[i:i + j]))))
            except NumberException:
                # unparseable span (e.g. "eight nine") — skip it
                pass
            i += j
        else:
            i += 1
    return sent_nums
def extract_summary_numbers(words):
    """Extract numbers from a token list, skipping basketball jargon.

    :param words: list of string tokens
    :return: list of (start, end, number_string, value) tuples, end exclusive
    """
    ignores = set([
        "three point", "three - point", "three - pt", "three pt",
        "three - pointers", "three pointers", "three pointer"
    ])
    numbers = []
    idx = 0
    while idx < len(words):
        is_number = False
        try:
            # fast path: token is already a digit string
            number_value = int(words[idx])
            numbers.append((idx, idx + 1, words[idx], number_value))
            idx += 1
            continue
        # BUG FIX: narrowed from a bare `except:` so that only the expected
        # parse failure is swallowed (a bare except also hid programming
        # errors such as KeyboardInterrupt/NameError).
        except ValueError:
            pass
        # try the longest spelled-out span first (at most 5 tokens)
        for end_idx in range(min(idx + 5, len(words)), idx, -1):
            number_string = ' '.join(words[idx:end_idx])
            try:
                number_value = text2num(number_string)
                numbers.append((idx, end_idx, number_string, number_value))
                is_number = True
                idx = end_idx
                break
            except NumberException:
                # known jargon phrases are not retried at shorter lengths
                if number_string in ignores:
                    break
        if not is_number:
            idx += 1
    return numbers
def extract_numbers(sent):
    """Find every number mention in a tokenized sentence.

    Returns a list of (start, end, value) triples: digit tokens become
    single-token spans, and runs of number words ("twenty three") are
    greedily merged and converted with text2num. Spans that text2num
    rejects are dropped.
    """
    spans = []
    # NOTE: kept from the original; currently unused by the logic below.
    ignores = set(["three point", "three-point", "three-pt", "three pt"])
    pos = 0
    n = len(sent)
    while pos < n:
        token = sent[pos]
        try:
            value = int(token)
        except ValueError:
            value = None
        if value is not None:
            spans.append((pos, pos + 1, value))
            pos += 1
            continue
        if token in number_words and not annoying_number_word(sent, pos):
            # greedily extend over consecutive number words
            # (this is kind of stupid, but matches the longest span)
            end = pos + 1
            while (end < n and sent[end] in number_words
                   and not annoying_number_word(sent, end)):
                end += 1
            try:
                spans.append((pos, end, text2num(" ".join(sent[pos:end]))))
            except NumberException:
                pass
            pos = end
        else:
            pos += 1
    return spans
def convert_string_to_num(response):
    """Return the value of the first space-separated token that text2num
    can parse; returns None implicitly when no token parses.
    """
    for tok in response.split(' '):
        try:
            parsed = text2num(tok)
        except NumberException:
            # best-effort: log the failure and try the next token
            print('number exception when converting text to num')
            continue
        return parsed
def extract_numbers(sent):
    """Extract number mentions from a tokenized sentence.

    Unlike the sibling variants, this one (a) accepts float-like tokens and
    fractions such as "1/3" (slashes stripped before the float test), and
    (b) records the original token *string* for digit tokens, not its value.
    Returns (start, end, token_or_value) tuples, end exclusive.
    """
    sent_nums = []
    i = 0
    #print sent
    while i < len(sent):
        toke = sent[i]
        a_number = False
        to_evaluate = toke.replace("/", "")  # handle 1/3
        try:
            itoke = float(to_evaluate)
            a_number = True
        except ValueError:
            pass
        if a_number:
            # note: appends the raw token string, not the parsed value
            sent_nums.append((i, i + 1, toke))
            i += 1
        elif toke in number_words:  # and not annoying_number_word(sent, i):
            # get longest span (this is kind of stupid)
            j = 1
            while i + j < len(sent) and sent[
                    i + j] in number_words:  # and not annoying_number_word(sent, i+j):
                j += 1
            try:
                sent_nums.append((i, i + j, text2num(" ".join(sent[i:i + j]))))
            except NumberException:
                pass
                #print sent
                #print sent[i:i+j]
                #assert False
            i += j
        else:
            i += 1
    return sent_nums
def build_sentence_info(timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param timestamps: flat list of times, two entries (start, end) per word
    :param sentence: raw sentence text
    :param sent_dict: word -> sentiment score mapping
    :return: list of (word, start, end, syllables, sentiment, punct, num)
    '''
    # (Python 2 code: uses the `unicode` builtin below.)
    h_en = Hyphenator('en_US')
    info_list = []
    # split on punctuation/whitespace; may leave empty strings at the edges
    words = re.split('[,.!?\r\n ]+', sentence)
    # BUG FIX: list.remove('') raises ValueError when the sentence produces
    # no empty token (no leading/trailing separator); filter instead, which
    # also drops a possible second empty token at the other edge.
    words = [w for w in words if w != '']
    words_with_punct = sentence.split()
    for ind, word in enumerate(words):
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        punct = ''
        # if the whitespace-split token differs, its last char is punctuation
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        num = t2n.text2num(word)
        info_list.append(
            (word, timestamps[ind * 2], timestamps[ind * 2 + 1],
             len(h_en.syllables(unicode(word))), c_sentiment, punct, num))
    return info_list
def squareroot(x):
    """Signed square root: sqrt(x) for x >= 0, -sqrt(-x) otherwise.

    Accepts anything float() can parse; otherwise falls back to parsing a
    spelled-out number with t2n.text2num.
    """
    try:
        value = float(x)
    except:
        value = float(t2n.text2num(x))
    if value < 0:
        # reflect negatives so the root is always real
        return -(-value) ** (1. / 2.)
    return value ** (1. / 2.)
def cuberoot(x):
    """Signed cube root: cbrt(x) for x >= 0, -cbrt(-x) otherwise.

    Accepts anything float() can parse; otherwise falls back to parsing a
    spelled-out number with t2n.text2num.
    """
    try:
        value = float(x)
    except:
        value = float(t2n.text2num(x))
    # reflect negatives so the fractional power stays real
    return value ** (1. / 3.) if value >= 0 else -(-value) ** (1. / 3.)
def text_normalize(string, convert2digit=True):
    """Clean a raw string with preprocess_text and optionally convert
    spelled-out numbers to digits via text2num.

    :param string: raw input text
    :param convert2digit: when True, run text2num on the cleaned text
    :return: the cleaned (and possibly number-converted) text
    """
    cleaned = preprocess_text(
        text=string,
        fix_unicode=False,
        lowercase=True,
        transliterate=False,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=True,
        no_currency_symbols=True,
        no_punct=False,
        no_contractions=True,
        no_accents=True,
        not_hashtag=True)
    return text2num(cleaned) if convert2digit else cleaned
def numwordsen(arg):
    """Convert an English number phrase to its digit string.

    Raises FunctionArgType on lexically invalid input or on conversion
    failure; "no"/"none" map to "0".
    """
    if len(arg) == 0 or not numwordsenPattern.match(arg):
        raise FunctionArgType(1, "numwordsen lexical error")
    if numwordsNoPattern.match(arg):  # match "no" or "none"
        return "0"
    normalized = commaAndPattern.sub(" ", arg.strip().lower())
    try:
        # must be returned as a string
        return str(text2num(normalized))
    except (NumberException, TypeError, ValueError) as ex:
        raise FunctionArgType(1, str(ex))
def process_item(item):
    """Coerce item[0] to an int (digits first, then number words), mutating
    `item` in place, and build an Item from the result.

    If neither conversion succeeds, item[0] is left unchanged.
    """
    head = item[0]
    try:
        item[0] = int(head)
    except ValueError:
        try:
            item[0] = text2num(head)
        except NumberException:
            # keep the original value untouched
            pass
    return Item(*item)
def parse_ages(stories):
    '''
    Search for age information.  stories is a list of 'story' strings
    returns a list of ages, same length as stories
    Newborns are considered to have age zero
    All non-year ages are quantized by taking a floor of the value
    i.e. 8 months is zero, 14 months is age 1, etc.
    'few','a' are considered to be 1 unit

    (Python 2 code: uses print statements and str.encode on str.)
    '''
    ages = []
    count = 0  # number of stories for which an age was identified
    for story in stories:
        age_match = age_regex.search(story)
        newborn_match = newborn_regex.search(story)
        if age_match is not None:
            age_str = age_match.groups()[0]
            # Parse unit as ascii and make lowercase
            unit = age_match.groups()[1].encode('ascii', 'replace').lower()
            try:
                age_unitless = int(age_str)
            except ValueError:
                try:
                    age_unitless = text2num(age_str.encode('ascii', 'replace').lower())
                except:
                    if age_str in AGE_STRINGS_AS_ONE:
                        age_unitless = 1
                    else:
                        # If problem parsing, assume it is a small number
                        print 'Error parsing \'%s\' into a number, converting to zero' % age_str
                        age_unitless = 0
                        print story
            count += 1
            # NOTE(review): if `unit` matches none of the branches below,
            # `age` keeps its value from the previous iteration (stale) or is
            # unbound on the first story — confirm age_regex only yields
            # these units.
            if unit in ['year', 'years']:
                age = age_unitless
            elif unit in ['month', 'months']:
                age = floor(float(age_unitless) / 12.)
                #print '%d months converted to %d years' % (age_unitless,age)
            elif unit in ['week', 'weeks']:
                age = floor(float(age_unitless) / 52.)
                #print '%d weeks converted to %d years' % (age_unitless,age)
            elif unit in ['day', 'days']:
                age = floor(float(age_unitless) / 365.)
                #print '%d days converted to %d years' % (age_unitless,age)
        elif newborn_match is not None:
            age = 0
            count += 1
        else:
            age = None
            #print story
        ages.append(age)
    print 'Identified %d ages out of %d patients' % (count, len(stories))
    return ages
def int_value(input):
    """Parse `input` as an integer, falling back to text2num for
    spelled-out numbers ("twelve" -> 12).

    The parameter keeps its historical name `input` (shadowing the builtin)
    so keyword callers remain compatible.

    :raises: whatever text2num raises when the fallback also fails.
    """
    # IMPROVED: replaced the is_number flag dance with direct EAFP.
    try:
        return int(input)
    except ValueError:
        return text2num(input)
def extract(text):
    """Convert the trailing number phrase of `text` to a number.

    Walks backwards from the end of the token list produced by prep(),
    keeping the longest suffix made entirely of valid number tokens, then
    converts that suffix with text2num.
    """
    tokens = prep(text)
    first = len(tokens)
    for i in reversed(range(len(tokens))):
        #print(tokens[i] + " in valid: " + str(tokens[i] in valid_tokens))
        if tokens[i] in valid_tokens:
            first = i
        else:
            break
    #print('_'.join(tokens[first:]))
    # BUG FIX: text2num is called with a string everywhere else in this
    # codebase; the original passed the raw token *list*, which a
    # string-based text2num cannot split. Join the suffix first.
    return text2num.text2num(' '.join(tokens[first:]))
def test_function(self):
    """Smoke-test a *sentence-level* text2num.

    NOTE: this expects a text2num variant that accepts a whole sentence and
    returns the sentence with spelled-out numbers replaced by digits — not
    the int-returning variant used elsewhere in this codebase.
    (Python 2 code: uses print statements.)
    """
    str_in = "No numbers in this sentence."
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "No numbers in this sentence.")
    str_in = "I have eighty one apples."
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "I have 81 apples.")
    str_in = "TWO HUNDRED THOUSAND DOLLARS IS WHAT YOU OWE ME!"
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "200000 DOLLARS IS WHAT YOU OWE ME!")
    str_in = "The number here is ten thousand three hundred forty one"
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "The number here is 10341")
    str_in = "There are six billion and five hundred fifty four million and nine hundred eleven thousand and three hundred twenty one people."
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "There are 6554911321 people.")
    str_in = "Kenneth will get twenty two hundred apples while Keith gets sixty seven."
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "Kenneth will get 2200 apples while Keith gets 67.")
    str_in = "I have seventeen cars, three hundred twenty seven servants, five thousand houses and two million and twenty three hundred dollars."
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "I have 17 cars, 327 servants, 5000 houses and 2002300 dollars.")
    str_in = "Two hundred and eighty two melons are laying in that box."
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "282 melons are laying in that box.")
    str_in = "This is a hundred hundred and a thousand thousand."
    str_out = text2num(str_in)
    print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
    assert(str_out == "This is a 100 100 and a 1000 1000.")
def test_get_a_story(self):
    """Exercise NumberService.parse and the int-returning text2num on a
    range of magnitudes (units through decillions).
    """
    numservice = NumberService()
    result = numservice.parse("11")
    print(result)
    self.assertEqual(1, text2num("one"))
    self.assertEqual(12, text2num("twelve"))
    self.assertEqual(72, text2num("seventy two"))
    self.assertEqual(300, text2num("three hundred"))
    self.assertEqual(1200, text2num("twelve hundred"))
    self.assertEqual(12304, text2num("twelve thousand three hundred four"))
    self.assertEqual(6000000, text2num("six million"))
    self.assertEqual(6400005, text2num("six million four hundred thousand five"))
    self.assertEqual(
        123456789012,
        text2num(
            "one hundred twenty three billion four hundred fifty six million seven hundred eighty nine thousand twelve"
        ))
    self.assertEqual(4000000000000000000000000000000000, text2num("four decillion"))
def save_rep():
    """Twilio SMS webhook: record a survey reply in the CSV at `path`.

    First reply from a phone number creates a row (rep1); a later reply
    updates rep2 and appends to the conversation log. Returns a Twilio
    response object.

    NOTE(review): `df.ix` was removed in pandas 1.0 — this code requires an
    old pandas; migrate to `.loc` when touching behavior.
    """
    message_body = request.form['Body']
    phone_number = request.form['From']
    write = True
    df = pd.read_csv(path)
    # first contact: no row for this phone number yet
    if all([df.ix[i, 'tel'] != int(phone_number) for i in range(df.shape[0])]):
        try:
            message_body = int(message_body)
        except:
            message_body = message_body.lower().strip()
            if message_body in Small:
                # spelled-out number ("deux") -> int
                message_body = text2num(message_body)
            elif message_body == 'chut':
                message_body = -1  # over-solicitation: user asked to stop
            else:
                # automatic reply: "I did not understand your response"
                # (ask again for a rating + STOP instructions)
                print('message incompris')
                write = False
                resp = unknow_sms()
        if write:
            d = {
                'tel': [phone_number],
                'rep1': [message_body],
                'rep2': [0],
                'conversation': [message_body]
            }
            tmp = pd.DataFrame(d)
            df = df.append(tmp)
            df.to_csv(path, index=False)
            # low scores (except 1) trigger the follow-up question
            if message_body < 10 and message_body != 1:
                body = irc_q2()
                resp = MessagingResponse()
                resp.message(body)
            else:
                resp = rep_default()
    else:
        # known number: store as second reply and append to the conversation
        df.ix[df['tel'] == int(phone_number), 'rep2'] = message_body
        df.ix[df['tel'] == int(phone_number), 'conversation'] = str(
            df.ix[df['tel'] == int(phone_number), 'conversation'][0]) + '/' + str(message_body)
        df.to_csv(path, index=False)
        resp = rep_default()
    return resp
def durwordsen(arg):
    """Convert an English duration phrase into an ISO-8601 duration string.

    Regex groups 1/61/121 carry the years/months/days parts; each non-zero
    part is converted (digits directly, words via text2num) and appended
    with its designator. Returns "P0D" when every part is zero/absent.

    :raises FunctionArgType: on lexical or conversion errors.
    """
    durWordsMatch = durwordsenPattern.match(arg)
    if not durWordsMatch or len(arg.strip()) == 0:
        raise FunctionArgType(1, "durwordsen lexical error")
    try:
        dur = 'P'
        durWordsMatchGroups = durWordsMatch.groups()
        # (group index, ISO designator) for years, months, days
        for groupIndex, groupSuffix in ((1, "Y"), (61, "M"), (121, "D")):
            groupPart = durWordsMatchGroups[groupIndex]
            # skip absent parts and explicit zero/no phrases
            if groupPart and not durwordZeroNoPattern.match(groupPart):
                if groupPart.isnumeric():
                    dur += groupPart + groupSuffix
                else:
                    dur += str(text2num(commaAndPattern.sub(" ", groupPart.strip().lower()))) + groupSuffix
        return dur if len(dur) > 1 else "P0D"  # must have at least one number and designator
    except (NumberException, TypeError, ValueError) as ex:
        raise FunctionArgType(1, str(ex))
def normalize_numbers(line):
    '''
    Gets a simple string and tries to match with certain aliases
    if numbers has been passed as English word

    Argument:
    line -- string we want to normalize

    Example:
    @input: "one + three - four"
    @output: "1 + 3 - 4"
    '''
    result = ""
    # A leading sign belongs to the first operand; keep it for the output.
    if line[0] in ['+', '-']:
        result = line[0]
        line = line[1:]
    # First pass: record the operators in the order they appear.
    list_operations = []
    rest = line
    while next_operator(rest):
        op = next_operator(rest)
        rest = rest[rest.index(op) + 1:]
        list_operations.append(op)
    # Second pass: mark every alias operator, then split on the marker.
    marked = line
    for alias in operator_alias:
        marked = marked.replace(alias, '#')
    operands = marked.split('#')
    # Rebuild: convert word-numbers, re-interleave the recorded operators.
    for idx in range(len(operands)):
        word = operands[idx].strip()
        try:
            int(word)
        except ValueError:
            word = str(text2num.text2num(word))
        result += word
        if idx < len(list_operations):
            result += list_operations[idx]
    return result
def normalize_numbers(line):
    '''
    Gets a simple string and tries to match with certain aliases
    if numbers has been passed as English word

    Argument:
    line -- string we want to normalize

    Example:
    @input: "one + three - four"
    @output: "1 + 3 - 4"

    NOTE(review): an identical definition of this function appears elsewhere
    in this source; if both live in one module, the later one shadows the
    earlier at import time — confirm and deduplicate.
    '''
    result = ""
    #-- If line starts with (-) or (+) we keep this for the result --
    if line[0] in ['+','-']:
        result = line[0]
        line = line[1:]
    # First pass: collect the operators in order of appearance.
    subline = line
    list_operations = []
    while next_operator(subline):
        operation = next_operator(subline)
        position = subline.index(operation)
        subline = subline[position+1:]
        list_operations.append(operation)
    # Second pass: replace alias operators with '#' and split into operands.
    subline = line
    for operation in operator_alias:
        subline = subline.replace(operation, '#')
    list_words = subline.split('#')
    # Rebuild: digits pass through, words go through text2num, operators
    # are re-interleaved in their original order.
    for position in range(len(list_words)):
        word = list_words[position].strip()
        try:
            int(word)
        except ValueError:
            word = str(text2num.text2num(word))
        result += word
        if position < len(list_operations):
            result += list_operations[position]
    return result
def extract_chunks(chunked, tags):
    """Rebuild an arithmetic expression string from an NLTK chunk tree.

    Walks subtrees whose label is in `tags`, accumulating non-operator
    leaves into a number phrase; when an operator leaf is hit, the phrase is
    converted with t2n.text2num and appended to the expression followed by
    the operator. A trailing unconverted phrase is appended as-is.
    """
    exp = ""
    digit = ""  # number phrase accumulated so far ("twenty three ")
    for subtree in chunked.subtrees(filter=lambda t: t.label() in tags):
        for l in subtree.leaves():
            print("l[0] -->>> ", str(l[0]))
            if str(l[0]) not in ["+", "-", "*", "/", "x", "X", "plus", "minus", "multiplied", "divided"]:
                digit += str(l[0]) + " "
            else:
                try:
                    # drop the trailing space before converting the phrase
                    digit = str(t2n.text2num(digit[:-1]))
                    digit += " " + str(l[0])
                    exp += " " + digit
                    digit = ""
                except Exception as e:
                    # conversion failed: keep accumulating (phrase not reset)
                    print("text2num error ->", e.args)
    # flush any phrase left after the last operator
    if len(digit) > 0:
        exp += " " + digit
    return exp
def _get_words(phrase):
    """Collect nouns and numbers from the noun phrases of `phrase`.

    Digit strings are kept as-is, spelled-out numbers are converted via
    text2num, and NN tokens are lowercased. Returns unicode strings.
    (Python 2 code: uses the `unicode` builtin.)
    """
    tree = parsetree(phrase.lower())
    collected = []
    for sentence in tree:
        for chunk in sentence.chunks:
            if chunk.type != 'NP':
                continue
            for w in chunk.words:
                if w.type == "CD":
                    try:
                        int(w.string)
                    except ValueError:
                        try:
                            collected.append(text2num.text2num(w.string))
                        except text2num.NumberException:
                            pass
                    else:
                        # already a digit string — keep it verbatim
                        collected.append(w.string)
                elif w.type == "NN":
                    collected.append(w.string.lower())
    return [unicode(w) for w in collected]
def _get_words(phrase):
    """Collect nouns and numbers from the noun phrases of `phrase`.

    CD tokens: digit strings are kept verbatim; spelled-out numbers are
    converted with text2num (unconvertible ones dropped). NN tokens are
    lowercased. Returns unicode strings (Python 2 code).

    NOTE(review): an identical definition of this function appears elsewhere
    in this source — confirm and deduplicate.
    """
    phrase = phrase.lower()
    t = parsetree(phrase)
    words = []
    for s in t:
        for chunk in s.chunks:
            if chunk.type == 'NP':
                for w in chunk.words:
                    if w.type == "CD":
                        try:
                            int(w.string)
                            words.append(w.string)
                        except ValueError:
                            try:
                                words.append(text2num.text2num(w.string))
                            except text2num.NumberException:
                                pass
                    elif w.type == "NN":
                        words.append(w.string.lower())
    return([unicode(w) for w in words])
def text_to_num(text):
    """Replace number phrases in `text` with digits.

    POS-tags the text, chunks candidate number phrases with a regex grammar,
    and substitutes each chunk's surface string with its t2n.text2num value.
    Chunks that fail to convert are logged and left unchanged.
    """
    tokenized = nltk.word_tokenize(text);
    tags = nltk.pos_tag(tokenized)
    print(tags)
    # grammar for phrases that may spell a number (very permissive)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)
    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        # rebuild the chunk's surface form, space-separated
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
    print(text)
    return text
def evaluate(expression):
    """Evaluate an arithmetic expression that may contain spelled-out
    numbers and unicode math symbols.

    text2num (a sentence-level variant returning a tuple here) rewrites the
    words to digits; the BNF grammar parses the result onto expr_stack,
    which evaluateStack reduces. timedelta results are rendered as a
    human-readable string.

    :return: (value, expression_string) tuple
    """
    BNF.expr_stack = []
    # NOTE: this text2num returns (rewritten, formatted) — sentence variant
    expression, formatted_expression = text2num(expression)
    expression = expression.replace(u"\u00D7", "*")  # X symbol
    expression = expression.replace(u"\u03c0", "PI")  # Greek PI symbol
    # print "CONVERTED:", expression, formatted_expression
    res = BNF.get_bnf().parseString(expression)
    # print "STACK:", BNF.expr_stack[:]
    # print "RES: ", res, "STACK: ", BNF.expr_stack
    val, expr = evaluateStack(BNF.expr_stack[:])
    #expr = " ".join(res)
    # print expr
    if isinstance(val, datetime.timedelta):
        # pretty-print durations instead of returning the raw timedelta
        val = "%d days %d hours %d minutes" % (
            val.days,  # IGNORE:E1103
            val.seconds // 3600,
            val.seconds % 3600 // 60)
    return val, expr
def build_sentence_info(timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param timestamps: flat list of times, two entries (start, end) per word
    :param sentence: raw sentence text
    :param sent_dict: word -> sentiment score mapping
    :return: list of (word, start, end, syllables, sentiment, punct, num)
    '''
    # (Python 2 code: uses the `unicode` builtin below.)
    h_en = Hyphenator('en_US')
    info_list = []
    # split on punctuation/whitespace; may leave empty strings at the edges
    words = re.split('[,.!?\r\n ]+', sentence)
    # BUG FIX: list.remove('') raises ValueError when the sentence produces
    # no empty token (no leading/trailing separator); filter instead, which
    # also drops a possible second empty token at the other edge.
    words = [w for w in words if w != '']
    words_with_punct = sentence.split()
    for ind, word in enumerate(words):
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        punct = ''
        # if the whitespace-split token differs, its last char is punctuation
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        num = t2n.text2num(word)
        info_list.append((word, timestamps[ind * 2], timestamps[ind * 2 + 1],
                          len(h_en.syllables(unicode(word))), c_sentiment,
                          punct, num))
    return info_list
def build_sentence_data(title, timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param title: title passed through to the SentenceData object
    :param timestamps: flat list of times, two entries (start, end) per word
    :param sentence: raw sentence text
    :param sent_dict: word -> sentiment score mapping
    :return: a SentenceData object contain text-based information about the sentence
    '''
    # (Python 2 code: uses the `unicode` builtin below.)
    s = SentenceData(title, sentence)
    s.words = []
    h_en = Hyphenator('en_US')
    words = re.split('[,.!?\r\n ]+', sentence)
    # BUG FIX: list.remove('') raises ValueError when the sentence produces
    # no empty token (no leading/trailing separator); filter instead.
    words = [w for w in words if w != '']
    words_with_punct = sentence.split()
    for ind, word in enumerate(words):
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        punct = ''
        # if the whitespace-split token differs, its last char is punctuation
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        num = t2n.text2num(word)
        # this t2n variant signals "not a number" with -1
        if num == -1:
            num = ''
        else:
            num = str(num)
        w = WordData(word, float(timestamps[ind * 2]),
                     float(timestamps[ind * 2 + 1]), c_sentiment,
                     len(h_en.syllables(unicode(word))), punct, num)
        s.words.append(w)
    return s
def check(self, ans):
    """Normalize a user's answer and compare it against self.ans.

    - word-problem questions (self.wo): parse number words via text2num
    - arithmetic questions (self.op in arith): coerce to int
    - otherwise: treat as a yes/no answer (en/fr variants accepted)

    Returns False when the answer cannot be normalized.
    """
    if self.wo:
        try:
            ans = text2num.text2num(str(ans))
        except:
            return False
    else:
        if self.op in SkillTestingQuestion.arith:
            try:
                ans = int(ans)
            except:
                return False
        else:
            try:
                if ans.lower() in ('true','yes','oui'):
                    ans = True
                elif ans.lower() in ('false','no','non'):
                    ans = False
                else:
                    return False
            except:
                # ans has no .lower() (not a string)
                return False
    return ans == self.ans
def extract_summary_numbers(sent_words, ignore_numbers=''):
    '''
    Extract textual numbers in the text but ignoring certain keywords (like "three pointers", "Elfmeter")
    Returns a list of tuples, which are composed of the start and end position of text numbers,
    the string that is the number and the value of the number.
    TODO: ++ What happens to non-int numbers?
    '''
    # BUG FIX: `ignore_numbers` was accepted but never used — `ignores` was
    # hard-coded to [], so nothing was ever ignored despite the docstring.
    # Default behavior (empty argument) is unchanged.
    ignores = ignore_numbers if ignore_numbers else []
    numbers = []
    idx = 0
    # try to parse string numbers as int ("2" -> 2)
    while idx < len(sent_words):
        is_number = False
        try:
            number_value = int(sent_words[idx])
            numbers.append((idx, idx + 1, sent_words[idx], number_value))
            idx += 1
            continue
        except ValueError:  # narrowed from a bare except
            pass
        # try to parse written numbers to ints ("Two" -> 2), longest first
        for end_idx in range(min(idx + 5, len(sent_words)), idx, -1):
            number_string = ' '.join(sent_words[idx:end_idx])
            try:
                number_value = text2num(number_string)
                numbers.append((idx, end_idx, number_string, number_value))
                is_number = True
                idx = end_idx
                break
            except NumberException:
                # ignored keyword phrases are not retried at shorter lengths
                if number_string in ignores:
                    break
        if not is_number:
            idx += 1
    return numbers
def extract_direct_math_expressions(tags):
    """Build and evaluate an arithmetic expression from POS-tagged tokens.

    Infix operator words ("plus", "minus", ...) append directly to `exp`;
    prefix verbs ("add", "subtract", ...) are buffered on `stack` and
    interleaved once their operands arrive ("subtract" reverses operand
    order). Number words are converted with t2n.text2num where possible.

    SECURITY / NOTE(review): the result is produced with eval(exp) on text
    derived from user speech — do not feed untrusted input. The
    `return str(eval("abc"))` in the "subtracted" branch always raises
    NameError when reached; it looks like leftover debug code — confirm.
    """
    exp = ""
    stack = []          # buffered prefix operators / operands
    counter = 0         # number of operator tokens seen so far
    isSubtract = False  # prefix "subtract" pending (operands must be swapped)
    isSubtracted = False
    for word in tags:
        skip = False
        if "add" == word[0]:
            stack.append(" + ")
        elif "subtract" == word[0]:
            stack.append(" - ")
            isSubtract = True
        elif "multiply" == word[0]:
            stack.append(" * ")
        elif "divide" == word[0]:
            stack.append(" / ")
        elif "plus" == word[0] or "+" == word[0] or "added" == word[0]:
            exp += " + "
        elif "minus" == word[0] or "-" == word[0]:
            exp += " - "
        elif "multiplied" == word[0] or "*" == word[0] or "x" == word[
                0] or "X" == word[0]:
            exp += " * "
        elif "divided" == word[0] or "/" == word[0]:
            exp += " / "
        elif "subtracted" == word[0]:
            exp += " - "
            # isSubtracted = True
            return str(eval("abc"))
        if word[1] == "CD" and word[0] not in ["*", "x", "X", "/", "+", "-"]:
            if isSubtract and len(stack) != 2:
                try:
                    stack.append(str(t2n.text2num(str(word[0]))))
                except:
                    stack.append(word[0])
                skip = True
            # elif isSubtracted:
            else:
                try:
                    exp += str(t2n.text2num(str(word[0])))
                except:
                    exp += str(word[0])
        # to check word numbers that are tagged as non 'CD' .... this is the issue with NLTK
        elif word[0] not in ["*", "x", "X", "/", "+", "-"]:
            if isSubtract and len(stack) != 2:
                try:
                    stack.append(str(t2n.text2num(str(word[0]))))
                except:
                    print("")
                skip = True
            else:
                try:
                    exp += str(t2n.text2num(str(word[0])))
                except:
                    print("")
        # flush buffered prefix operator(s) once operands are in place
        if counter > 0 and len(stack) > 0 and not skip:
            if isSubtract:
                # "subtract a from b" => b - a: reverse before emitting
                stack.reverse()
                exp += stack.pop()
                exp += stack.pop()
                isSubtract = False
            else:
                exp += stack.pop()
        if word[0] in [
                "*", "x", "X", "/", "+", "-", "add", "subtract", "multiply",
                "divide", "added", "subtracted", "multiplied", "divided"
        ]:
            counter += 1
    print("exp 2 -> ", exp)
    return str(eval(exp))
def process_doc(doc):
    """Post-process every annotation XML file for document `doc`.

    For each XML file: loads the raw text and the document creation time
    (DCT), rebuilds missing parentsType elements, clears all entity
    properties, infers links between nearby entities via the tnschema
    relation table, then refills each entity's properties (Type, Value,
    Interval-Type, Semantics, schema relations) and writes the result to
    out_path. Uses module-level path/rawpath/dctpath/out_path/tnschema/types.

    NOTE(review): reformatted from a collapsed source line; statement
    grouping inside some if/else branches was inferred — verify against the
    original history before relying on exact control flow.
    """
    for xmlfile in os.listdir(path + '/' + doc):
        axml = etree.parse(path + '/' + doc + '/' + xmlfile)
        rawfile = open(os.path.join(rawpath, doc), 'r')
        text = rawfile.read()
        rawfile.close()
        dctfile = open(os.path.join(dctpath, doc, doc + ".dct"), 'r')
        dct = dctfile.read().rstrip()
        dctfile.close()
        try:
            # weekday of the document creation time, e.g. "Monday"
            dct = dprs.parse(dct)
            dctDayofWeek = dct.strftime('%A')
        except ValueError:
            dctDayofWeek = ""
        entities = dict()  # eid -> (eid, start, end, type, parentsType)
        starts = dict()    # start offset -> [eid, ...]
        for entity in axml.findall('.//entity'):
            eid = entity.find('./id').text
            estart, eend = map(int, entity.find('./span').text.split(','))
            etype = entity.find('./type').text
            eparentsType = entity.find('./parentsType')
            if eparentsType is not None:
                eparentsType = eparentsType.text
            else:
                # backfill parentsType from the schema
                eparentsType = tnschema[etype]["parentsType"]
                parentsType = etree.Element("parentsType")
                parentsType.text = eparentsType
                entity.append(parentsType)
            eproperties = entity.find('./properties')
            # Empty all links
            if eproperties is not None:
                for prop in eproperties.findall('./*'):
                    eproperties.remove(prop)
            else:
                prop = etree.Element("properties")
                entity.append(prop)
            if estart not in starts:
                starts[estart] = list()
            ent_values = (eid, estart, eend, etype, eparentsType)
            starts[estart].append(eid)
            entities[eid] = ent_values
        # link entities that are close together (gap <= 10 chars)
        links = dict()
        stack = list()
        entity_list = dict()
        lend = -1
        for start in sorted(starts):
            for entity in starts[start]:
                (eid, estart, eend, etype, eparentsType) = entities[entity]
                if estart - lend > 10 and lend > -1:
                    # gap too large: start a new cluster
                    stack = list()
                    entity_list = dict()
                lend = eend
                entity_list[eid] = (estart, eend, etype, eparentsType)
                ltype = ""
                stack_pointer = list()
                stack_pointer.extend(stack)
                # try to relate the new entity to each earlier one (LIFO)
                while len(stack_pointer) > 0:
                    s = stack_pointer.pop()
                    stype = entity_list[s][2]
                    ltype = get_relation(tnschema, etype, stype)
                    if ltype != '':
                        if eid not in links:
                            links[eid] = dict()
                        if ltype not in links[eid]:
                            links[eid][ltype] = list()
                        links[eid][ltype].append(s)
                    else:
                        # try the opposite direction
                        ltype = get_relation(tnschema, stype, etype)
                        if ltype != '':
                            if s not in links:
                                links[s] = dict()
                            if ltype not in links[s]:
                                links[s][ltype] = list()
                            links[s][ltype].append(eid)
                stack.append(eid)
        # second pass: refill each entity's properties
        for entity in axml.findall('.//entity'):
            eid = entity.find('./id').text
            etype = entity.find('./type').text
            estart, eend = map(int, entity.find('./span').text.split(','))
            eproperties = entity.find('./properties')
            if etype in tnschema:
                for relation in tnschema[etype]:
                    if relation != "parentsType":
                        span = "".join(text[estart:eend])
                        if relation == "Type":
                            ptype = span.title()
                            if ptype == "About":
                                ptype = "Approx"
                            if etype in types:
                                if span in types[etype]:
                                    ptype = types[etype][span]
                            # Calendar-Interval is singular, Period plural
                            if etype == "Calendar-Interval" and ptype != "Unknown":
                                if ptype.endswith("s"):
                                    ptype = ptype[:-1]
                            elif etype == "Period" and ptype != "Unknown":
                                if not ptype.endswith("s"):
                                    ptype += "s"
                            ty = etree.Element(relation)
                            ty.text = ptype
                            eproperties.append(ty)
                        elif relation == "Value":
                            val = etree.Element(relation)
                            # strip leading zeros ("07" -> "7"), keep "0"
                            span = re.sub(r'^0(\d)', r'\1', re.sub(r'^0+', '0', span))
                            span = str(text2num.text2num(span))
                            val.text = span
                            eproperties.append(val)
                        elif re.search('Interval-Type', relation):
                            # "Link" when an Interval link exists, else DocTime
                            intervalemtpy = True
                            if eid in links:
                                if "Interval" in links[eid]:
                                    if links[eid]["Interval"] != "":
                                        intervalemtpy = False
                            if not intervalemtpy:
                                itype = etree.Element(relation)
                                itype.text = "Link"
                                eproperties.append(itype)
                            else:
                                itype = etree.Element(relation)
                                itype.text = "DocTime"
                                eproperties.append(itype)
                        elif relation == "Semantics":
                            sem = etree.Element(relation)
                            sem.text = "Interval-Not-Included"
                            eproperties.append(sem)
                        else:
                            # generic schema relation: emit linked children,
                            # or an empty element if the schema requires one
                            notnull = False
                            if eid in links:
                                if relation in links[eid]:
                                    for child in links[eid][relation]:
                                        si = etree.Element(relation)
                                        si.text = child
                                        eproperties.append(si)
                                        notnull = True
                            if tnschema[etype][relation][0] and not notnull:
                                if eproperties.find('./' + relation) is None:
                                    si = etree.Element(relation)
                                    eproperties.append(si)
            if etype == "Last":
                # "Last <weekday>" includes the interval when the weekday
                # matches the document creation day
                semantics = eproperties.findall('./Semantics')[0]
                interval_included = "Interval-Not-Included"
                for repint in eproperties.findall('./Repeating-Interval'):
                    if repint.text is not None:
                        (rid, rstart, rend, rtype, rparentsType) = entities[repint.text]
                        rspan = "".join(text[int(rstart):int(rend)])
                        if rspan.title() == dctDayofWeek:
                            interval_included = "Interval-Included"
                semantics.text = interval_included
        if not os.path.exists(out_path + '/' + doc):
            os.makedirs(out_path + '/' + doc)
        axml.write(out_path + '/' + doc + '/' + xmlfile, pretty_print=True)
def nums(x):
    """Best-effort number-word conversion.

    Returns text2num.text2num(x) when it succeeds; on any failure the
    input is returned unchanged.
    """
    try:
        return text2num.text2num(x)
    except:
        # conversion failed — pass the value through untouched
        return x
all_ents, players, teams, cities, total_players, total_teams, total_cities = get_ents(entry) box_score = entry["box_score"] player_name_map = {y: x for x, y in box_score['PLAYER_NAME'].items()} home_line_score = entry["home_line"] vis_line_score = entry["vis_line"] summary = entry['summary'] instance_count += 1 else: curr.append(line.strip()) args = line.split("|") name = args[0] record_type = args[2].strip() value = args[1] if not value.isdigit(): value = text2num(value) else: value = int(value) if record_type.startswith("PLAYER-"): record_type = record_type[len("PLAYER-"):] name = name.replace("UNK", "").strip() if name == 'Los Angeles' and 'LA' in total_cities: name = 'LA' if name in total_players: pass elif name in total_teams: pass elif name in players: name = resolve_name(name, total_players) elif name == 'Los Angeles Clippers' and 'LA Clippers' in total_teams:
def remUnits(text):
    """Strip units/formatting from a free-text quantity field and return a
    number, or 'N/A' when nothing numeric can be recovered.

    Handles, in order: placeholder strings; "N million" scaling; fields with
    operators/keywords (=, version, total, /, of, per, ...) via keyword-
    specific aggregation; then a character-level digit scan; and finally
    spelled-out number words via text2num.

    NOTE(review): the local variable `bool` shadows the builtin within this
    function; harmless here but worth renaming when behavior is next touched.
    """
    # placeholder values
    if(text == ' ' or text.lower() == 'unknown' or text.lower() == 'to be determined'):
        return 'N/A'
    if('million' in text.lower()):
        # number immediately before the word "million", scaled up
        pieces = text.replace('~', '').split()
        idx = pieces.index('million') - 1
        million_mult = 1000000
        try:
            return int(pieces[idx]) * million_mult
        except ValueError:
            return int(float(pieces[idx]) * million_mult)
    if('=' in text.lower() or 'version' in text.lower() or 'total' in text.lower()
            or '(' in text.lower() or '/' in text.lower() or ':' in text.lower()
            or 'of' in text.lower() or 'each' in text.lower() or 'per' in text.lower()
            or 'in' in text.lower() or '--' in text.lower()):
        temt = text.replace(',', '').replace('-', '').replace('/', ' / ')  #text.replace('(', '').replace(')', '').replace('/', ' / ').replace(',', '')
        pieces = temt.split()
        nams = []  # every int/float found in the field
        for guess in pieces:
            try:
                frag = int(guess)
                nams.append(frag)
                continue
            except ValueError:
                pass
            try:
                frag = float(guess)
                nams.append(frag)
                continue
            except ValueError:
                continue
        if(nams != []):
            # keyword-specific aggregation of the collected numbers
            if('=' in text.lower()):
                return nams[len(nams) - 1]
            if('version' in text.lower()):
                return nams[0]
            if('/' in text.lower() or 'of' in text.lower() or 'each' in text.lower()
                    or 'per' in text.lower()):
                try:
                    # e.g. "3 of 4" -> product; fall back to the first number
                    return nams[0] * nams[1]
                except IndexError:
                    return nams[0]
            elif('total' in text.lower() or '--' in text.lower() or '(' in text.lower()):
                return nams[0]
            elif(':' in text.lower() or 'in' in text.lower()):
                return sum(nams)
    # drop parenthesized asides, normalize ", " to ","
    text = re.sub(r'\([^)]*\)', '', text).replace(', ', ',')
    runningString = ''
    bool = True
    lastChar = ''
    # character scan: keep digit runs, space-separate everything else;
    # a 'Z-' sequence terminates the scan early
    for x in list(text):
        if(lastChar == 'Z'):
            if(x == '-'):
                break
        if x.isdigit() or x == ',' or x == '.':
            runningString += x
            bool = True
        else:
            if bool:
                runningString += ' '
            bool = False
        lastChar = x
    #split by spaces
    pieces = runningString.split(' ')
    tot = 0
    #look through each piece
    check = False
    for fragment in pieces:
        #first attempt is if in format of 'US = 500'
        if('=' in fragment):
            try:
                fragment = fragment.split('=')[1]
                grab = int(fragment)
                tot = tot + grab
                check = True
                continue
            except ValueError:
                pass
        if('.' in fragment):
            try:
                tot = tot + float(fragment)
                check = True
                continue
            except ValueError:
                pass
        if(',' in fragment):
            # thousands separators: "1,000" -> 1000
            try:
                tot = tot + int(''.join(fragment.split(',')))
                check = True
                continue
            except ValueError:
                pass
        try:
            tot = tot + int(fragment)
            check = True
        except ValueError:
            continue
    if(check == True):
        return tot
    # last resort: sum any spelled-out number words
    flag = False
    if(tot == 0):
        pieces2 = text.lower()
        pieces2 = pieces2.split(' ')
        for fragment in pieces2:
            try:
                tot = tot + text2num(fragment)
                flag = True
            except ValueError:
                pass
    if(flag and tot == 0):
        return 0
    if(tot != 0):
        return tot
    if(tot == 0):
        return 'N/A'
    return tot
def square(x):
    """Return x squared.

    x may be anything float() accepts, or a spelled-out English number
    which is parsed with t2n.text2num.
    """
    try:
        return float(x) ** 2
    except:
        # not float-parseable (or the fast path failed): try number words
        return float(t2n.text2num(x)) ** 2
# Convert the FY14 police budget CSV into monthly JSON budget records.
# For each "Precinct <name>" row the yearly total is split evenly across
# 12 months; precinct names spelled as words are converted with text2num.
# (Python 2 script: imports the py2-only StringIO module.)
import csv
import json
from text2num import text2num
import StringIO

if __name__ == '__main__':
    with open('rawdata/police_budget_fy14.csv') as f:
        police_data = f.read()
    budget_datas = []
    reader = csv.reader(StringIO.StringIO(police_data))
    for precinct, val in reader:
        try:
            # normalize "Precinct Seven" (and common misspellings) -> "seven"
            precinct = precinct.lower().replace("precinct", "").replace(
                "precint", "").replace("precinc", "").strip()
            precinct_id = text2num(precinct)
            for m in range(1, 13):
                budget_datas.append({
                    "month": m,
                    "precinct": precinct_id,
                    # NOTE(review): integer division on py2 — truncates cents
                    "total": int(val) / 12,
                    "type": "budget",
                    "year": 2014
                })
        except:
            # TODO: for now, skip everything else, we are skipping about 7
            pass
    print(json.dumps(budget_datas))
def takeorderfunction():
    """Interactively take a food order over the microphone.

    Uses the module-level speech recognizer (`recog`), microphone wrapper
    (`sr`), and TTS engine (`engine`) to ask for the caller's name and
    order, normalizes spoken numbers to digit strings via text2num, and
    accumulates order items with makelist.makeorder.

    Returns:
        (finalod, mess, name): accumulated order items, a transcript of
        what was understood, and the caller's name.

    NOTE(review): on a speech-service outage this returns a bare error
    string instead of the 3-tuple -- callers must handle both shapes.
    The "Unavailible" typo in two messages is user-facing runtime text and
    is kept byte-identical here.
    """
    finalod = []  # accumulated order items
    print(
        "INSTRUCTIONS: \n 1) Be clear \n 2) Mention Quantity, even for suborders \n 3) Avoid Repeating name for suborder"
    )
    engine.say("hi. whats your name?")
    engine.runAndWait()
    # Ask for the caller's name until recognition succeeds.
    while True:
        with sr.Microphone() as source:  # use the default microphone as the audio source
            name = recog.listen(source)
            # recognise name
            try:
                name = str(recog.recognize_google(name))
                break
            except sr.UnknownValueError:
                engine.say("Oops! Didn't catch that")
                engine.runAndWait()
                continue
            except sr.RequestError as e:
                mess = "Sorry Service is Unavailible at the moment"
                return mess
    engine.say("what would you like to eat?")
    engine.runAndWait()
    # Ask for the main order until recognition succeeds.
    while True:
        with sr.Microphone() as source:  # use the default microphone as the audio source
            order = recog.listen(source)
            # recognise order via Google Speech Recognition
            try:
                order = recog.recognize_google(order)
                print("You said " + order)
                # order=input()  # remove after testing
                break
            except sr.UnknownValueError:
                engine.say("Oops! Didn't catch that")
                engine.runAndWait()
                continue
            except sr.RequestError as e:
                mess = "Sorry Service is Unavailible at the moment"
                return mess
    order = order.lower()
    # Join known multi-word item names with underscores so they survive split().
    for word in makelist.spacethings:
        order = order.replace(word, word.replace(" ", "_"))
    # Treat the article "a" as quantity 1; the second pass catches
    # occurrences that overlap the first (e.g. " a a ").
    order = order.replace(" a ", " 1 ").replace(" a ", " 1 ")
    order = order.split()
    # Convert spelled-out numbers ("two") to digit strings ("2").
    for k, v in enumerate(order):
        order[k] = str(text2num.text2num(str(v).lower()))
    # print(order)  # remove after testing
    mess = "You said " + " ".join(order)
    # print(finalod)
    finalod.extend(makelist.makeorder(order))
    print("before anyhting more", finalod)
    # Keep offering additions until the caller says "no".
    while True:
        engine.say("Anything more?")
        engine.runAndWait()
        while True:
            with sr.Microphone() as source:  # use the default microphone as the audio source
                add = recog.listen(source)
                # recognise yes/no confirmation
                try:
                    add = str(recog.recognize_google(add))
                    if add == "yes" or add == "no":
                        break
                    else:
                        engine.say("please answer with yes or no. Anything more?")
                        engine.runAndWait()
                        continue
                except sr.UnknownValueError:
                    engine.say("Oops! Didn't catch that")
                    engine.runAndWait()
                    continue
                except sr.RequestError as e:
                    mess = "Sorry Service is Unavailable at the moment"
                    return mess
        if add == "yes":
            engine.say("what would you like to add to your order")
            engine.runAndWait()
            # Same recognize/normalize pipeline as the main order.
            while True:
                with sr.Microphone() as source:  # use the default microphone as the audio source
                    aorder = recog.listen(source)
                    # recognise additional order
                    try:
                        aorder = str(recog.recognize_google(aorder))
                        print(
                            "You said " + aorder
                        )
                        # order=input()  # remove after testing
                        break
                    except sr.UnknownValueError:
                        engine.say("Oops! Didn't catch that. Please try again.")
                        engine.runAndWait()
                        continue
                    except sr.RequestError as e:
                        mess = "Sorry Service is Unavailable at the moment"
                        return mess
            aorder = aorder.lower()
            for word in makelist.spacethings:
                aorder = aorder.replace(word, word.replace(" ", "_"))
            aorder = aorder.replace(" a ", " 1 ").replace(" a ", " 1 ")
            aorder = aorder.split()
            for k, v in enumerate(aorder):
                aorder[k] = str(text2num.text2num(str(v).lower()))
            # print(order)  # remove after testing
            mess = mess + " + " + " ".join(aorder)
            finalod.extend(makelist.makeorder(aorder))
        if add == "no":
            break
    return finalod, mess, name
def atoi(text): tokens = prep(text) result = text2num.text2num(tokens) return result
def fix_tokenization(s): global full_name_cnt mwe_file = "/home/hongmin_wang/table2text_nlg/harvardnlp/data2text-harvard/mwes.json" with io.open(mwe_file, 'r', encoding='utf-8') as fmwe: tmp = json.load(fmwe) mwes = {k: v for k, v in tmp.items() if v > 1} full_names = {' '.join(k.split('_')): k for k, _ in mwes.items()} clean = [] for k, v in full_names.items(): if k in s: full_name_cnt += 1 s = s.replace(k, v) for w in s.split(): if w.endswith("s’"): w = ' '.join([w[:-1], "'"]) elif w.endswith("’s"): w = ' '.join([w[:-2], "'s"]) if re.search(p1, w): components = w.split('.') if len(components) == 2: print("Original {}".format(w)) w = ' . '.join(components) print("changed to {}".format(w)) if w.endswith('..'): print("Original {}".format(w)) w = '{} .'.format(components[0]) print("changed to {}".format(w)) if re.search(p2, w): print("Original {}".format(w)) num, suffix = re.findall(p2, w)[0] w = ' '.join([num, suffix]) print("changed to {}".format(w)) # fix tokenization errors caused by commas if re.search(p3, w): print("Original {}".format(w)) w = ''.join(w.split(',')) print("changed to {}".format(w)) if re.search(p4, w): print("Original {}".format(w)) w = ' , '.join(w.split(',')) print("changed to {}".format(w)) if re.search(p5, w): print("Original {}".format(w)) w = ' - '.join(w.split('-')) print("changed to {}".format(w)) if re.search(p6, w): print("Original {}".format(w)) pieces = re.findall(p6, w)[0] try: pieces[0] = text2num(pieces[0]) except: pass w = ' '.join(pieces) print("changed to {}".format(w)) if re.search(p7, w): pre = re.findall(p7, w)[0] print("Original {}".format(w)) w = ' '.join([pre, 'two_point']) print("changed to {}".format(w)) clean.append(w.strip()) result = ' '.join(clean) for k, v in post_fixes.items(): result = result.replace(k, v) return result
def input_help_to_vec(p):
    """Turn a natural-language input-requirement sentence into char vectors.

    Parses `p` with parsetree and, for every NP chunk, builds a copy of the
    module-level INPUT_VECTOR whose "length" comes from CD (cardinal) words
    and whose "chars" pool is grown from NN/JJ words that resemble
    letters/capitals/numbers/lowercase/uppercase/special (similarity > 0.9).
    Sentence-level "or" conjunctions then drop one of the alternative
    vectors.

    NOTE(review): Python 2 code -- relies on string.lowercase/uppercase.

    Returns:
        A list of requirement dicts having non-zero "length" and a
        non-empty "chars" list.
    """
    t = parsetree(p)
    requirements = []
    mandatory = False
    # pprint(t)
    for sen in t:
        for i, chunk in enumerate(sen.chunks):
            if chunk.type == "ADJP":
                # An adjective phrase similar to "mandatory" flags that at
                # least one requirement must exist.
                # NOTE(review): `vector` built here is never used in this
                # branch; it is rebuilt for NP chunks below.
                vector = copy.deepcopy(INPUT_VECTOR)
                for w in chunk.words:
                    if w.type.startswith("JJ") and mandatory_similarity(w.string) > 0.9:
                        mandatory = True
            if chunk.type == "NP":
                vector = copy.deepcopy(INPUT_VECTOR)
                # Chars implied by adjectives, attached at the next noun.
                adjv_nn_bridge = []
                op = Operator()  # 0 = and & 1 = or
                ignore = False  # Useful when have DT like no etc..
                for w in chunk.words:
                    if w.type == "CD":
                        # Cardinal number -> required length.
                        try:
                            op.get()
                            vector["length"] = int(w.string)
                        except ValueError:
                            # Spelled-out number ("eight").
                            try:
                                vector["length"] = text2num.text2num(w.string)
                            except text2num.NumberException:
                                pass
                    elif w.type == "CC":
                        # Conjunction chooses how the next char class combines.
                        ignore = False
                        if w.string.lower() == "and":
                            op.set(0)
                        elif w.string.lower() == "or":
                            op.set(1)
                    elif w.type.startswith("NN"):
                        # Noun resembling: letters / capitals / digits.
                        similarities = [alphabet_similarity(w.string), capital_similarity(w.string), number_similarity(w.string)]
                        m = max(similarities)
                        m_index = similarities.index(m)
                        if m > 0.9 and not ignore:
                            if m_index == 0:
                                # Generic "characters": use the adjective
                                # bridge, seeding it with one lowercase char
                                # if empty.
                                if len(adjv_nn_bridge) == 0:
                                    adjv_nn_bridge.append(random.choice(list(string.lowercase)))
                                vector["chars"] = operate(vector["chars"], adjv_nn_bridge, op)
                            elif m_index == 1:
                                vector["chars"] = operate(vector["chars"], [random.choice(list(string.uppercase))], op)
                            elif m_index == 2:
                                vector["chars"] = operate(vector["chars"], [random.choice([str(i) for i in range(0, 10)])], op)
                    elif w.type.startswith("JJ"):
                        # Adjective resembling: lowercase / uppercase / special.
                        similarities = [lowercase_similarity(w.string), uppercase_similarity(w.string), special_similarity(w.string)]
                        m = max(similarities)
                        m_index = similarities.index(m)
                        if m > 0.9 and not ignore:
                            if m_index == 0:
                                adjv_nn_bridge = operate(adjv_nn_bridge, [random.choice(list(string.lowercase))], op)
                            elif m_index == 1:
                                adjv_nn_bridge = operate(adjv_nn_bridge, [random.choice(list(string.uppercase))], op)
                            elif m_index == 2:
                                adjv_nn_bridge = operate(adjv_nn_bridge, [random.choice(['!', '$'])], op)
                            if vector["length"] == 0:
                                vector["length"] = 1
                        else:
                            op.get()  # If there is a CC it gets cleaned because we couldn't identify the adjective
                    elif w.type.startswith("DT"):
                        # Determiner like "no" negates the following class.
                        if w.string.lower().startswith("no"):
                            ignore = True
                requirements.append(vector)
    if mandatory and len(requirements) == 0:
        # "mandatory" with no explicit classes: one char of anything.
        requirements.append({"length": 1, "chars": ['x']})
    # Handling conjunctions at sentence level
    # Merging vectors based on 'and' and 'or' as of now
    l = []
    last_chunk = None
    for w in t.words:
        if w.chunk == None and w.type.startswith("CC"):
            if w.string.lower() == "or":
                l.append(1)  # marker: neighbours are alternatives
        elif w.chunk and w.chunk.type == "NP":
            # One requirement vector was queued per NP chunk above;
            # consume it the first time we enter that chunk.
            if last_chunk == None or (last_chunk != w.chunk):
                l.append(requirements.pop(0))
                last_chunk = w.chunk
    final = []
    i = 0
    while i < len(l):
        if l[i] == 1:
            # "or": skip the marker and the following alternative vector.
            i += 2
        else:
            # Keep only meaningful vectors.
            if l[i]["length"] != 0 and len(l[i]["chars"]) > 0:
                final.append(l[i])
            i += 1
    return(final)