コード例 #1
0
def extract_numbers(sent):
    """Find numeric spans in a tokenized sentence.

    Returns a list of (start, end, value) tuples where [start, end) is the
    token span.  Digit tokens are taken verbatim; maximal runs of number
    words are parsed with text2num, falling back to the first word alone
    when the whole run fails to parse.
    """
    sent_nums = []
    i = 0
    while i < len(sent):
        toke = sent[i]
        # Plain digit token, e.g. "42".
        a_number = False
        try:
            value = int(toke)
            a_number = True
        except ValueError:
            pass
        if a_number:
            sent_nums.append((i, i + 1, value))
            i += 1
        # BUGFIX: the guard must be `not annoying_number_word` — the comment
        # ("get longest span") and the sibling implementations in this file
        # agree; the original grouped only the "annoying" words.
        elif toke in number_words and not annoying_number_word(
                sent, i):  # get longest span  (this is kind of stupid)
            j = 1
            # BUGFIX: use `<`, not `<=` — with i + j == len(sent) the
            # original raised IndexError on sent[i + j].
            while i + j < len(sent) and sent[
                    i + j] in number_words and not annoying_number_word(
                        sent, i + j):
                j += 1
            try:
                sent_nums.append((i, i + j, text2num(" ".join(sent[i:i + j]))))
            except NumberException:
                # Whole run unparseable: fall back to the first word alone.
                sent_nums.append((i, i + 1, text2num(sent[i])))
            i += j
        else:
            i += 1
    return sent_nums
コード例 #2
0
    def create_questions(self, sentence, chunked):
        """Collect gap candidates (NUMBER / LOCATION / PROPER chunks) from a
        chunked sentence and, when enough distinct gaps exist, add a
        QuestionSentence to the quiz.
        """
        gaps = []
        for word in chunked:
            # Plain (token, tag) tuples are skipped; subtrees are chunks.
            if type(word) != tuple:
                orig_phrase = " ".join(y[0] for y in word)

                if word.label() == "NUMBER":
                    modified_phrase = orig_phrase[:]

                    try:
                        # Convert a spelled-out number to its numeric value.
                        # BUGFIX: the original passed the undefined name
                        # `phrase`; the resulting NameError was swallowed by
                        # a bare except, so spelled-out numbers were never
                        # converted.
                        modified_phrase = t2n.text2num(orig_phrase)
                    except Exception:
                        try:
                            # Not spelled out: accept only phrases that are
                            # already numeric.
                            int(modified_phrase) + float(modified_phrase)
                        except (TypeError, ValueError):
                            # Neither convertible nor numeric — ignore it.
                            continue

                    # NOTE(review): aborting the whole sentence here looks
                    # deliberate, but `continue` may have been intended —
                    # confirm with callers.
                    if self.probably_range(modified_phrase):
                        return

                    gaps.append((word.label(), orig_phrase, modified_phrase))
                elif word.label() in ["LOCATION", "PROPER"]:
                    gaps.append((word.label(), orig_phrase, orig_phrase))

        # Require at least two gaps, all distinct, before building a question.
        if len(gaps) >= 2 and len(gaps) == len(set(gaps)):
            gaps_filtered = [gap for gap in gaps
                             if gap[0] == 'NUMBER' or gap[0] == 'LOCATION']
            if len(gaps_filtered) and len(gaps) - len(gaps_filtered) > 2:
                self.quiz.add(QuestionSentence(sentence, gaps_filtered))
コード例 #3
0
 def get_freq_sequences(self, data_dir):
     """Collect the 75 most frequent number-free word trigrams from
     train.target and cache their tokenized form in ``self.freq_seq``.

     Trigrams containing any token parseable as an int (digits, or via
     text2num) are skipped.
     """
     big_map = defaultdict(int)
     with open(os.path.join(data_dir, "train.target"),
               'r',
               encoding='utf-8') as f:
         for paragraph in f.readlines():
             words = paragraph.split(' ')
             for i in range(0, len(words) - 2):
                 li = words[i:i + 3]
                 has_num = False
                 for tok in li:
                     num = ''
                     try:
                         num = int(tok)
                     except:
                         try:
                             num = text2num(tok)
                         except:
                             pass
                     # num stays '' (not an int) when neither parse worked.
                     if isinstance(num, int):
                         has_num = True
                 if not has_num:
                     current_seq = ' '.join(li)
                     big_map[current_seq] += 1
     # Tokenize the top-75 trigrams by frequency in one padded batch.
     tokens = self.tokenizer.batch_encode_plus([
         k for k, v in sorted(
             big_map.items(), key=lambda item: item[1], reverse=True)
     ][:75],
                                               return_tensors='pt')
     # Map the (2nd, 3rd) input ids of each encoding to its 5th id.
     # NOTE(review): the magic indices 1:3 and 4 assume a fixed framing
     # (special tokens plus one id per word) — confirm against the
     # tokenizer actually in use.
     self.freq_seq = {
         tuple(x[1:3].tolist()): x[4]
         for x in tokens['input_ids']
     }
     print(self.freq_seq)
コード例 #4
0
ファイル: data_utils.py プロジェクト: neulab/ie-eval
def extract_numbers(sent) -> List[NumberSpan]:
    """Extract NumberSpan(start, end, value) entries from a token list.

    Digit tokens become single-token spans; maximal runs of number words
    (filtered through annoying_number_word) are parsed with text2num,
    and unparseable runs are silently dropped.
    """
    sent_nums = []
    i = 0
    while i < len(sent):
        toke = sent[i]

        if toke.isnumeric():
            sent_nums.append(NumberSpan(i, i + 1, int(toke)))
            i += 1
        elif toke in Keywords.number and not annoying_number_word(
                sent, i):  # get longest span  (this is kind of stupid)
            j = 1
            while (i + j < len(sent) and sent[i + j] in Keywords.number
                   and not annoying_number_word(sent, i + j)):
                j += 1

            # corner cases: "x three - pointers", "eight nine turnovers"
            # BUGFIX: bounds-guard the lookahead — when the span ended at
            # the final token, sent[i + 2] raised IndexError.
            if j > 1 and i + 2 < len(sent) and sent[i + 2] in [
                    "-", "'s", "turnovers"
            ]:
                j = 1
            try:
                sent_nums.append(
                    NumberSpan(i, i + j, text2num(" ".join(sent[i:i + j]))))
            except NumberException:
                pass
            i += j
        else:
            i += 1
    return sent_nums
コード例 #5
0
def extract_summary_numbers(words):
    """Return (start, end, text, value) tuples for numbers found in *words*.

    Digit tokens are parsed directly; otherwise the longest span of up to
    five tokens accepted by text2num is used.  The "three point(er)"
    phrases in *ignores* abort the span search so they are not treated as
    numbers.
    """
    ignores = set([
        "three point", "three - point", "three - pt", "three pt",
        "three - pointers", "three pointers", "three pointer"
    ])
    numbers = []
    idx = 0
    while idx < len(words):
        is_number = False
        # BUGFIX: catch only conversion errors — the original bare except
        # also swallowed unrelated failures (even KeyboardInterrupt).
        try:
            number_value = int(words[idx])
            numbers.append((idx, idx + 1, words[idx], number_value))
            idx += 1
            continue
        except (TypeError, ValueError):
            pass
        # Greedy: try the longest candidate span (up to 5 tokens) first.
        for end_idx in range(min(idx + 5, len(words)), idx, -1):
            number_string = ' '.join(words[idx:end_idx])
            try:
                number_value = text2num(number_string)
                numbers.append((idx, end_idx, number_string, number_value))
                is_number = True
                idx = end_idx
                break
            except NumberException:
                if number_string in ignores:
                    break
        if not is_number:
            idx += 1
    return numbers
コード例 #6
0
def extract_numbers(sent):
    """Return (start, end, value) spans for numbers in a token list.

    Digit tokens are taken verbatim; maximal runs of number words
    (filtered through annoying_number_word) are parsed with text2num,
    and spans text2num rejects are silently dropped.
    """
    sent_nums = []
    i = 0
    # Removed: an `ignores` set of "three point" variants was built here
    # but never consulted; dead debug prints/asserts dropped too.
    while i < len(sent):
        toke = sent[i]
        a_number = False
        try:
            int(toke)
            a_number = True
        except ValueError:
            pass
        if a_number:
            sent_nums.append((i, i + 1, int(toke)))
            i += 1
        elif toke in number_words and not annoying_number_word(sent, i):
            # Greedily extend over consecutive number words.
            j = 1
            while (i + j < len(sent) and sent[i + j] in number_words
                   and not annoying_number_word(sent, i + j)):
                j += 1
            try:
                sent_nums.append((i, i + j, text2num(" ".join(sent[i:i + j]))))
            except NumberException:
                pass
            i += j
        else:
            i += 1
    return sent_nums
コード例 #7
0
ファイル: main.py プロジェクト: devintjones/wfp-hackathon
 def convert_string_to_num(response):
     """Return the value of the first space-separated token that text2num
     can parse; print a notice for each token it rejects and return None
     when nothing parses.
     """
     for tok in response.split(' '):
         try:
             value = text2num(tok)
         except NumberException:
             print('number exception when converting text to num')
         else:
             return value
コード例 #8
0
def extract_numbers(sent):
    """Scan a token list and return (start, end, value) spans.

    Tokens that parse as floats after stripping "/" (so "1/3" counts)
    are kept verbatim as single-token spans; maximal runs of number
    words are converted via text2num, unparseable runs dropped.
    """
    sent_nums = []
    i = 0
    total = len(sent)
    while i < total:
        toke = sent[i]
        stripped = toke.replace("/", "")  # handle 1/3
        numeric = True
        try:
            float(stripped)
        except ValueError:
            numeric = False
        if numeric:
            # Keep the original token text, not the parsed value.
            sent_nums.append((i, i + 1, toke))
            i += 1
        elif toke in number_words:
            # Extend the span over consecutive number words.
            j = 1
            while i + j < total and sent[i + j] in number_words:
                j += 1
            try:
                sent_nums.append((i, i + j, text2num(" ".join(sent[i:i + j]))))
            except NumberException:
                pass
            i += j
        else:
            i += 1
    return sent_nums
コード例 #9
0
def build_sentence_info(timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param timestamps: flat list of times, two entries (start, end) per word
    :param sentence: raw sentence text
    :param sent_dict: mapping word -> sentiment score
    :return: list of (word, start_time, end_time, syllable_count,
        sentiment, punct, number) tuples, one per word
    '''
    # for test
    # print sentence

    h_en = Hyphenator('en_US')
    info_list = []
    # words = re.split('\W+', sentence)
    words = re.split('[,.!?\r\n ]+', sentence)
    # print words
    # print len(words)
    # print len(timestamps)
    # NOTE(review): remove('') drops only the FIRST empty token and raises
    # ValueError when none is present — assumes the split always produces
    # exactly one '' (trailing delimiter); confirm with callers.
    words.remove('')
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        # Sentiment defaults to 0 for words missing from the lexicon.
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        # Recover trailing punctuation stripped by the delimiter split.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        # NOTE(review): presumably returns a sentinel for non-number words
        # (a sibling function in this file checks for -1) — confirm t2n's
        # contract.  `unicode` below means this is Python 2 code.
        num = t2n.text2num(word)
        info_list.append(
            (word, timestamps[ind * 2], timestamps[ind * 2 + 1],
             len(h_en.syllables(unicode(word))), c_sentiment, punct, num))
    return info_list
def squareroot(x):
    """Signed square root of *x*: sqrt(x) for x >= 0, -sqrt(|x|) otherwise.

    *x* may be a number or a spelled-out number (parsed via t2n).
    """
    try:
        value = float(x)
    except:  # fall back to parsing spelled-out numbers
        value = float(t2n.text2num(x))
    sign = 1.0 if value >= 0 else -1.0
    return sign * abs(value) ** (1. / 2.)
def cuberoot(x):
    """Real cube root of *x* (negative inputs yield -|x|**(1/3)).

    *x* may be a number or a spelled-out number (parsed via t2n).
    """
    try:
        value = float(x)
    except:  # fall back to parsing spelled-out numbers
        value = float(t2n.text2num(x))
    sign = 1.0 if value >= 0 else -1.0
    return sign * abs(value) ** (1. / 3.)
コード例 #12
0
def text_normalize(string, convert2digit=True):
    """Scrub *string* via preprocess_text (lowercase, strip URLs/emails/
    phone numbers/numbers/currency/contractions/accents) and, when
    convert2digit is true, convert spelled-out numbers with text2num.
    """
    cleaned = preprocess_text(text=string, fix_unicode=False, lowercase=True,
                              transliterate=False, no_urls=True,
                              no_emails=True, no_phone_numbers=True,
                              no_numbers=True, no_currency_symbols=True,
                              no_punct=False, no_contractions=True,
                              no_accents=True, not_hashtag=True)
    return text2num(cleaned) if convert2digit else cleaned
コード例 #13
0
ファイル: SEC.py プロジェクト: jasonleinbach-wf/Arelle-1
def numwordsen(arg):
    """Convert an English number phrase to its numeric value as a string.

    "no"/"none" map to "0"; lexical or parse failures raise
    FunctionArgType.
    """
    if len(arg) == 0 or not numwordsenPattern.match(arg):
        raise FunctionArgType(1, "numwordsen lexical error")
    if numwordsNoPattern.match(arg):  # match "no" or "none"
        return "0"
    try:
        normalized = commaAndPattern.sub(" ", arg.strip().lower())
        # Callers expect a string, not an int.
        return str(text2num(normalized))
    except (NumberException, TypeError, ValueError) as ex:
        raise FunctionArgType(1, str(ex))
コード例 #14
0
ファイル: non_rg_metrics.py プロジェクト: ha-lins/DTG-SI
 def process_item(item):
     """Coerce item[0] to an int when possible — digits first, then a
     spelled-out number via text2num — and wrap the fields in an Item.
     Unparseable values are left untouched.
     """
     raw = item[0]
     try:
         item[0] = int(raw)
     except ValueError:
         try:
             item[0] = text2num(raw)
         except NumberException:
             pass
     return Item(*item)
コード例 #15
0
def parse_ages(stories) :
    ''' Search for age information.
    stories is a list of 'story' strings

    returns a list of ages (in years), same length as stories; entries are
    None when no age information was found

    Newborns are considered to have age zero
    All non-year ages are quantized by taking a floor of the value i.e. 8 months is
    zero, 14 months is age 1, etc.

    'few','a' are considered to be 1 unit
    '''
    # Python 2 module (print statements and str.encode below).
    ages = []
    count = 0
    for story in stories :
        age_match = age_regex.search(story)
        newborn_match = newborn_regex.search(story)
        if age_match is not None :
            # group 0 is the quantity, group 1 the unit ("years", ...).
            age_str = age_match.groups()[0]
            # Parse unit as ascii and make lowercase
            unit = age_match.groups()[1].encode('ascii','replace').lower()
            try :
                age_unitless = int(age_str)
            except ValueError :
                try :
                    age_unitless = text2num(age_str.encode('ascii','replace').lower())
                except :
                    if age_str in AGE_STRINGS_AS_ONE:
                        age_unitless = 1
                    else :
                        # If problem parsing, assume it is a small number
                        print 'Error parsing \'%s\' into a number, converting to zero' % age_str
                        age_unitless = 0
                        print story
            count+=1
            # Quantize to whole years by flooring.
            if unit in ['year','years'] :
                age = age_unitless
            elif unit in ['month','months'] :
                age = floor(float(age_unitless)/12.)
                #print '%d months converted to %d years' % (age_unitless,age)
            elif unit in ['week','weeks'] :
                age = floor(float(age_unitless)/52.)
                #print '%d weeks converted to %d years' % (age_unitless,age)
            elif unit in ['day','days'] :
                age = floor(float(age_unitless)/365.)
                #print '%d days converted to %d years' % (age_unitless,age)
            # NOTE(review): if the unit matches none of the branches above,
            # `age` keeps its value from the previous iteration (or is
            # unbound on the first) — latent bug; confirm age_regex can
            # only yield the units handled here.
        elif newborn_match is not None :
            age = 0
            count+=1
        else :
            age = None
            #print story
        ages.append(age)
    print 'Identified %d ages out of %d patients' % (count,len(stories))
    return ages
コード例 #16
0
ファイル: clean.py プロジェクト: phymucs/rw_fg
def int_value(input):
    """Parse *input* as an int, falling back to text2num for spelled-out
    numbers.  (The parameter name shadows the builtin but is kept for
    compatibility with existing keyword callers.)
    """
    try:
        return int(input)
    except ValueError:
        return text2num(input)
コード例 #17
0
ファイル: atoi.py プロジェクト: stensonowen/factorial_bot
def extract(text):
    """Parse the trailing run of number tokens in *text*.

    Tokenizes with prep(), walks backwards to find where the final
    unbroken run of valid_tokens begins, and hands that suffix to
    text2num.
    """
    tokens = prep(text)
    start = len(tokens)
    for idx in reversed(range(len(tokens))):
        if tokens[idx] not in valid_tokens:
            break
        start = idx
    return text2num.text2num(tokens[start:])
コード例 #18
0
ファイル: run_tests.py プロジェクト: quankiquanki/text2num
 def test_function(self):
     """Exercise text2num's sentence-rewriting mode: each sentence should
     come back with spelled-out numbers replaced by digits and all other
     text untouched.  (Python 2: uses print statements.)
     """
     # Sentence with no numbers passes through unchanged.
     str_in = "No numbers in this sentence."
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "No numbers in this sentence.")

     str_in = "I have eighty one apples."
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "I have 81 apples.")

     # ALL-CAPS numbers convert too; surrounding case is preserved.
     str_in = "TWO HUNDRED THOUSAND DOLLARS IS WHAT YOU OWE ME!"
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "200000 DOLLARS IS WHAT YOU OWE ME!")

     str_in = "The number here is ten thousand three hundred forty one"
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "The number here is 10341")

     # "and" inside large composite numbers is handled.
     str_in = "There are six billion and five hundred fifty four million and nine hundred eleven thousand and three hundred twenty one people."
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "There are 6554911321 people.")

     # Multiple independent numbers in one sentence.
     str_in = "Kenneth will get twenty two hundred apples while Keith gets sixty seven."
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "Kenneth will get 2200 apples while Keith gets 67.")

     str_in = "I have seventeen cars, three hundred twenty seven servants, five thousand houses and two million and twenty three hundred dollars."
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "I have 17 cars, 327 servants, 5000 houses and 2002300 dollars.")

     str_in = "Two hundred and eighty two melons are laying in that box."
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "282 melons are laying in that box.")

     # "a hundred"/"a thousand" convert; the article "a" is kept.
     str_in = "This is a hundred hundred and a thousand thousand."
     str_out = text2num(str_in)
     print 'Input:\t%s\nOutput:\t%s\n' % (str_in, str_out)
     assert(str_out == "This is a 100 100 and a 1000 1000.")
コード例 #19
0
    def test_get_a_story(self):
        """Smoke-test NumberService.parse on "11" and spot-check text2num
        on spelled-out values from "one" up to "four decillion"."""
        numservice = NumberService()
        result = numservice.parse("11")

        print(result)

        self.assertEqual(1, text2num("one"))
        self.assertEqual(12, text2num("twelve"))
        self.assertEqual(72, text2num("seventy two"))
        self.assertEqual(300, text2num("three hundred"))
        self.assertEqual(1200, text2num("twelve hundred"))
        self.assertEqual(12304, text2num("twelve thousand three hundred four"))
        self.assertEqual(6000000, text2num("six million"))
        self.assertEqual(6400005,
                         text2num("six million four hundred thousand five"))
        self.assertEqual(
            123456789012,
            text2num(
                "one hundred twenty three billion four hundred fifty six million seven hundred eighty nine thousand twelve"
            ))
        self.assertEqual(4000000000000000000000000000000000,
                         text2num("four decillion"))
コード例 #20
0
def save_rep():
    """Webhook handler: record an incoming SMS reply in the CSV at *path*.

    A first reply from a phone number is parsed into a numeric rating
    (digits, a spelled-out number via text2num, or 'chut' -> -1) and
    appended as a new row; a later reply is stored in 'rep2' and appended
    to 'conversation'.  Returns the response object to send back.
    """
    message_body = request.form['Body']
    phone_number = request.form['From']
    write = True
    df = pd.read_csv(path)

    # First contact from this number?
    # NOTE(review): pandas .ix (used throughout) is long deprecated —
    # migrate to .loc/.iloc when this is next touched.
    if all([df.ix[i, 'tel'] != int(phone_number) for i in range(df.shape[0])]):
        try:
            message_body = int(message_body)
        except:
            message_body = message_body.lower().strip()
            if message_body in Small:
                # Spelled-out number -> int.
                message_body = text2num(message_body)
            elif message_body == 'chut':
                message_body = -1  # sursollicitation (opt-out marker)
            else:
                # Unrecognized reply: send the "not understood" auto-reply.
                print('message incompris')
                write = False
                resp = unknow_sms()

        if write:
            d = {
                'tel': [phone_number],
                'rep1': [message_body],
                'rep2': [0],
                'conversation': [message_body]
            }
            tmp = pd.DataFrame(d)
            df = df.append(tmp)
            df.to_csv(path, index=False)

            # Low (but not 1) ratings trigger the follow-up question.
            # NOTE(review): message_body may still be a str here on the
            # 'chut'/unparsed paths under Python 3 — `< 10` would raise;
            # this looks written for Python 2 ordering semantics.
            if message_body < 10 and message_body != 1:
                body = irc_q2()
                resp = MessagingResponse()
                resp.message(body)
            else:
                resp = rep_default()
    else:
        # Known number: store as second reply and extend the conversation log.
        df.ix[df['tel'] == int(phone_number), 'rep2'] = message_body
        df.ix[df['tel'] == int(phone_number), 'conversation'] = str(
            df.ix[df['tel'] == int(phone_number),
                  'conversation'][0]) + '/' + str(message_body)
        df.to_csv(path, index=False)
        resp = rep_default()

    return resp
コード例 #21
0
ファイル: SEC.py プロジェクト: jasonleinbach-wf/Arelle-1
def durwordsen(arg):
    """Convert an English duration phrase into an XSD-style duration string.

    Builds "P<y>Y<m>M<d>D" from the years/months/days groups matched by
    durwordsenPattern; zero/"no" components are skipped and "P0D" is
    returned when no component is present.  Raises FunctionArgType on
    lexical or numeric errors.
    """
    match = durwordsenPattern.match(arg)
    if match is None or len(arg.strip()) == 0:
        raise FunctionArgType(1, "durwordsen lexical error")
    try:
        groups = match.groups()
        parts = ['P']
        # Fixed group indices into the (large) pattern for each component.
        for index, suffix in ((1, "Y"), (61, "M"), (121, "D")):
            component = groups[index]
            if not component or durwordZeroNoPattern.match(component):
                continue
            if component.isnumeric():
                parts.append(component + suffix)
            else:
                cleaned = commaAndPattern.sub(" ", component.strip().lower())
                parts.append(str(text2num(cleaned)) + suffix)
        dur = ''.join(parts)
        # Must have at least one number and designator.
        return dur if len(dur) > 1 else "P0D"
    except (NumberException, TypeError, ValueError) as ex:
        raise FunctionArgType(1, str(ex))
コード例 #22
0
def normalize_numbers(line):
    '''
    Replace spelled-out English numbers in an arithmetic expression with
    digits, preserving the operators between them.

    Argument:
    line -- string we want to normalize

    Example:
        @input: "one + three - four"
        @output: "1 + 3 - 4"
    '''
    # Robustness fix: an empty input used to raise IndexError on line[0].
    if not line:
        return ""

    result = ""

    #-- If line starts with (-) or (+) we keep this for the result --
    if line[0] in ['+', '-']:
        result = line[0]
        line = line[1:]

    # Collect the operators in order of appearance.
    subline = line
    list_operations = []
    while next_operator(subline):
        operation = next_operator(subline)
        position = subline.index(operation)
        subline = subline[position + 1:]
        list_operations.append(operation)

    # Split the operands by replacing every operator alias with '#'.
    subline = line
    for operation in operator_alias:
        subline = subline.replace(operation, '#')
    list_words = subline.split('#')

    # Re-assemble: each operand (converted if spelled out) followed by
    # the operator that came after it.
    for position in range(len(list_words)):
        word = list_words[position].strip()
        try:
            int(word)  # already numeric: keep verbatim
        except ValueError:
            word = str(text2num.text2num(word))

        result += word
        if position < len(list_operations):
            result += list_operations[position]

    return result
コード例 #23
0
def normalize_numbers(line):
    '''
    Gets a simple string and tries to match with certain aliases
    if numbers has been passed as English word

    Argument:
    line -- string we want to normalize

    Example:
        @input: "one + three - four"
        @output: "1 + 3 - 4"
    '''
    prefix = ""
    # A leading sign is carried over to the output unchanged.
    if line[0] in ['+', '-']:
        prefix = line[0]
        line = line[1:]

    # Gather the operators left to right.
    operators = []
    remainder = line
    while next_operator(remainder):
        op = next_operator(remainder)
        remainder = remainder[remainder.index(op) + 1:]
        operators.append(op)

    # Mask every operator alias with '#' and split into operands.
    masked = line
    for alias in operator_alias:
        masked = masked.replace(alias, '#')

    pieces = []
    for pos, chunk in enumerate(masked.split('#')):
        token = chunk.strip()
        try:
            int(token)  # already a digit string: keep verbatim
        except ValueError:
            token = str(text2num.text2num(token))
        pieces.append(token)
        if pos < len(operators):
            pieces.append(operators[pos])

    return prefix + "".join(pieces)
コード例 #24
0
def extract_chunks(chunked, tags):
    """Walk the chunks whose label is in *tags* and build an expression
    string: runs of number words are converted to digits via t2n when an
    operator token is reached; operator tokens are kept as-is.
    """
    operators = ["+", "-", "*", "/", "x", "X",
                 "plus", "minus", "multiplied", "divided"]
    exp = ""
    pending = ""
    for subtree in chunked.subtrees(filter=lambda t: t.label() in tags):
        for leaf in subtree.leaves():
            token = str(leaf[0])
            print("l[0] -->>> ", token)
            if token not in operators:
                # Accumulate number words until an operator appears.
                pending += token + " "
            else:
                try:
                    # Drop the single trailing space before converting.
                    converted = str(t2n.text2num(pending[:-1]))
                    exp += " " + converted + " " + token
                    pending = ""
                except Exception as e:
                    print("text2num error ->", e.args)
    if len(pending) > 0:
        exp += " " + pending
    return exp
コード例 #25
0
ファイル: agent.py プロジェクト: sampr0/nightfury
 def _get_words(phrase):
     """Extract salient words from *phrase* (Python 2: returns unicode).

     Walks the parse tree's NP chunks, keeping cardinal numbers (CD) —
     numeric strings verbatim, spelled-out ones converted via text2num —
     and plain nouns (NN) lower-cased.
     """
     phrase = phrase.lower()
     t = parsetree(phrase)
     words = []
     for s in t:
         for chunk in s.chunks:
             if chunk.type == 'NP':
                 for w in chunk.words:
                     if w.type == "CD":
                         try:
                             int(w.string)
                             words.append(w.string)
                         except ValueError:
                             try:
                                 words.append(text2num.text2num(w.string))
                             except text2num.NumberException:
                                 pass
                     elif w.type == "NN":
                         words.append(w.string.lower())
     return ([unicode(w) for w in words])
コード例 #26
0
ファイル: d2v.py プロジェクト: tunnelshade/nightfury
 def _get_words(phrase):
     """Extract salient words from *phrase* (Python 2: returns unicode).

     Walks the parse tree's NP chunks, keeping cardinal numbers (CD) —
     numeric strings verbatim, spelled-out ones converted via text2num —
     and plain nouns (NN) lower-cased.
     """
     phrase = phrase.lower()
     t = parsetree(phrase)
     words = []
     for s in t:
         for chunk in s.chunks:
             if chunk.type == 'NP':
                 for w in chunk.words:
                     if w.type == "CD":
                         try:
                             int(w.string)
                             words.append(w.string)
                         except ValueError:
                             try:
                                 words.append(text2num.text2num(w.string))
                             except text2num.NumberException:
                                 pass
                     elif w.type == "NN":
                         words.append(w.string.lower())
     return([unicode(w) for w in words])
コード例 #27
0
def text_to_num(text):
    """Replace number-word phrases in *text* with their numeric value.

    POS-tags the tokens, chunks them with a regexp grammar, and
    substitutes each matched phrase via t2n.text2num, leaving the phrase
    untouched when conversion fails.
    """
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    # BUGFIX: `t.label() in "Chunk0"` was a substring test (any label that
    # happens to be a substring of "Chunk0" matched); use equality.
    for subtree in chunkedData.subtrees(filter=lambda t: t.label() == "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print(text)
    return text
コード例 #28
0
def evaluate(expression):
    """Evaluate a natural-language math *expression*.

    The normalized form from text2num is parsed against the BNF grammar
    and evaluated off the resulting expression stack.  timedelta results
    are rendered as a "D days H hours M minutes" string.

    :return: (value, printable expression) tuple
    """
    BNF.expr_stack = []
    # NOTE(review): this text2num returns a 2-tuple, unlike the
    # single-value variants elsewhere — confirm which module is imported.
    expression, formatted_expression = text2num(expression)

    expression = expression.replace(u"\u00D7", "*")  # X symbol
    expression = expression.replace(u"\u03c0", "PI")  # Greek PI symbol

    # print "CONVERTED:", expression, formatted_expression
    res = BNF.get_bnf().parseString(expression)
    # print "STACK:", BNF.expr_stack[:]
    # print "RES: ", res, "STACK: ", BNF.expr_stack
    val, expr = evaluateStack(BNF.expr_stack[:])
    #expr = " ".join(res)
    # print expr

    # Render durations in a human-readable form.
    if isinstance(val, datetime.timedelta):
        val = "%d days %d hours %d minutes" % (
            val.days,  # IGNORE:E1103
            val.seconds // 3600,
            val.seconds % 3600 // 60)

    return val, expr
コード例 #29
0
ファイル: Utilities.py プロジェクト: Juncai/SpeechTextLabeler
def build_sentence_info(timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param timestamps: flat list of times, two entries (start, end) per word
    :param sentence: raw sentence text
    :param sent_dict: mapping word -> sentiment score
    :return: list of (word, start_time, end_time, syllable_count,
        sentiment, punct, number) tuples, one per word
    '''
    # for test
    # print sentence


    h_en = Hyphenator('en_US')
    info_list = []
    # words = re.split('\W+', sentence)
    words = re.split('[,.!?\r\n ]+', sentence)
    # print words
    # print len(words)
    # print len(timestamps)
    # NOTE(review): remove('') drops only the FIRST empty token and raises
    # ValueError when none is present — assumes a trailing delimiter.
    words.remove('')
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        # Sentiment defaults to 0 for words missing from the lexicon.
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        # Recover trailing punctuation stripped by the delimiter split.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        # NOTE(review): presumably a sentinel is returned for non-number
        # words (build_sentence_data below checks for -1) — confirm t2n's
        # contract.  `unicode` means this is Python 2 code.
        num = t2n.text2num(word)
        info_list.append((word,
                          timestamps[ind * 2],
                          timestamps[ind * 2 + 1],
                          len(h_en.syllables(unicode(word))),
                          c_sentiment,
                          punct,
                          num))
    return info_list
コード例 #30
0
def build_sentence_data(title, timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param title: title stored on the SentenceData
    :param timestamps: flat list of times, two entries (start, end) per word
    :param sentence: raw sentence text
    :param sent_dict: mapping word -> sentiment score
    :return: a SentenceData object contain text-based information about the sentence
    '''
    # for test
    # print sentence

    s = SentenceData(title, sentence)
    s.words = []

    h_en = Hyphenator('en_US')
    words = re.split('[,.!?\r\n ]+', sentence)

    # NOTE(review): remove('') drops only the FIRST empty token and raises
    # ValueError when none is present — assumes a trailing delimiter.
    words.remove('')
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        # Sentiment defaults to 0 for out-of-lexicon words.
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        # Recover trailing punctuation stripped by the delimiter split.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        # A -1 result (presumably "not a number") is stored as ''.
        num = t2n.text2num(word)
        if num == -1:
            num = ''
        else:
            num = str(num)
        w = WordData(word, float(timestamps[ind * 2]),
                     float(timestamps[ind * 2 + 1]), c_sentiment,
                     len(h_en.syllables(unicode(word))), punct, num)
        s.words.append(w)
    return s
コード例 #31
0
ファイル: skilltester.py プロジェクト: kevinross/hvsi
	def check(self, ans):
		"""Coerce *ans* according to the question type and compare it with
		self.ans; any coercion failure yields False.

		When self.wo is truthy the answer is parsed with text2num; when
		self.op is in SkillTestingQuestion.arith it is parsed with int();
		otherwise it is treated as a boolean (true/yes/oui, false/no/non).
		"""
		if self.wo:
			try:
				ans = text2num.text2num(str(ans))
			except:
				return False
		else:
			if self.op in SkillTestingQuestion.arith:
				try:
					ans = int(ans)
				except:
					return False
			else:
				try:
					if ans.lower() in ('true','yes','oui'):
						ans = True
					elif ans.lower() in ('false','no','non'):
						ans = False
					else:
						return False
				except:
					return False
		return ans == self.ans
コード例 #32
0
ファイル: UDE.py プロジェクト: Maxgermany/BachelorFork
def extract_summary_numbers(sent_words, ignore_numbers=''):
    '''
    Extract textual numbers in the text but ignoring certain keywords (like "three pointers", "Elfmeter")
    Returns a list of tuples, which are composed of the start and end position of text numbers, the string
    that is the number and the value of the number.

    :param ignore_numbers: iterable of phrases that abort the span search
        when encountered (default: nothing ignored)

    TODO:
    ++ What happens to non-int numbers?
    '''
    # BUGFIX: ignore_numbers was accepted but never used — `ignores` was
    # always the empty list.  The default '' still yields no ignores, so
    # existing callers are unaffected.
    ignores = set(ignore_numbers) if ignore_numbers else set()
    numbers = []
    idx = 0
    # try to parse string numbers as int ("2" -> 2)
    while idx < len(sent_words):
        is_number = False
        # BUGFIX: catch only conversion errors instead of a bare except.
        try:
            number_value = int(sent_words[idx])
            numbers.append((idx, idx + 1, sent_words[idx], number_value))
            idx += 1
            continue
        except (TypeError, ValueError):
            pass
        # try to parse written numbers to ints ("Two" -> 2), longest span
        # (up to 5 tokens) first
        for end_idx in range(min(idx + 5, len(sent_words)), idx, -1):
            number_string = ' '.join(sent_words[idx:end_idx])
            try:
                number_value = text2num(number_string)
                numbers.append((idx, end_idx, number_string, number_value))
                is_number = True
                idx = end_idx
                break
            except NumberException:
                if number_string in ignores:
                    break
        if not is_number:
            idx += 1
    return numbers
コード例 #33
0
ファイル: Utilities.py プロジェクト: Juncai/SpeechTextLabeler
def build_sentence_data(title, timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param title: title stored on the SentenceData
    :param timestamps: flat list of times, two entries (start, end) per word
    :param sentence: raw sentence text
    :param sent_dict: mapping word -> sentiment score
    :return: a SentenceData object contain text-based information about the sentence
    '''
    # for test
    # print sentence

    s = SentenceData(title, sentence)
    s.words = []

    h_en = Hyphenator('en_US')
    words = re.split('[,.!?\r\n ]+', sentence)

    # NOTE(review): remove('') drops only the FIRST empty token and raises
    # ValueError when none is present — assumes a trailing delimiter.
    words.remove('')
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        # Sentiment defaults to 0 for out-of-lexicon words.
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        # Recover trailing punctuation stripped by the delimiter split.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        # A -1 result (presumably "not a number") is stored as ''.
        num = t2n.text2num(word)
        if num == -1:
            num = ''
        else:
            num = str(num)
        w = WordData(word, float(timestamps[ind * 2]), float(timestamps[ind * 2 + 1]), c_sentiment,
                     len(h_en.syllables(unicode(word))), punct, num)
        s.words.append(w)
    return s
def extract_direct_math_expressions(tags):
    """Build and evaluate an arithmetic expression from POS-tagged words.

    :param tags: list of (word, pos_tag) pairs, e.g.
                 [("2", "CD"), ("plus", "IN"), ("3", "CD")]
    :return: str(eval(expression)) -- the evaluated result as a string.

    Prefix-style operator words ("add", "subtract", ...) are buffered on a
    stack and emitted after their operands; infix words/symbols are
    appended to the expression directly.
    """
    exp = ""
    stack = []
    counter = 0
    isSubtract = False
    isSubtracted = False

    for word in tags:
        skip = False
        if "add" == word[0]:
            stack.append(" + ")
        elif "subtract" == word[0]:
            stack.append(" - ")
            isSubtract = True
        elif "multiply" == word[0]:
            stack.append(" * ")
        elif "divide" == word[0]:
            stack.append(" / ")
        elif "plus" == word[0] or "+" == word[0] or "added" == word[0]:
            exp += " + "
        elif "minus" == word[0] or "-" == word[0]:
            exp += " - "
        elif "multiplied" == word[0] or "*" == word[0] or "x" == word[
                0] or "X" == word[0]:
            exp += " * "
        elif "divided" == word[0] or "/" == word[0]:
            exp += " / "
        elif "subtracted" == word[0]:
            # BUG FIX: this branch previously did `return str(eval("abc"))`,
            # which always raised NameError at runtime.  Treat "subtracted"
            # like "minus" and keep scanning the remaining tags instead.
            exp += " - "

        if word[1] == "CD" and word[0] not in ["*", "x", "X", "/", "+", "-"]:
            if isSubtract and len(stack) != 2:
                try:
                    stack.append(str(t2n.text2num(str(word[0]))))
                except:
                    stack.append(word[0])
                skip = True
            else:
                try:
                    exp += str(t2n.text2num(str(word[0])))
                except:
                    exp += str(word[0])

        # to check word numbers that are tagged as non 'CD' .... this is the issue with NLTK
        elif word[0] not in ["*", "x", "X", "/", "+", "-"]:
            if isSubtract and len(stack) != 2:
                try:
                    stack.append(str(t2n.text2num(str(word[0]))))
                except:
                    print("")
                skip = True
            else:
                try:
                    exp += str(t2n.text2num(str(word[0])))
                except:
                    print("")

        # Flush buffered prefix operators once at least one operand is out.
        if counter > 0 and len(stack) > 0 and not skip:
            if isSubtract:
                # "subtract A from B" -> emit operands in reversed order.
                stack.reverse()
                exp += stack.pop()
                exp += stack.pop()
                isSubtract = False
            else:
                exp += stack.pop()
        if word[0] in [
                "*", "x", "X", "/", "+", "-", "add", "subtract", "multiply",
                "divide", "added", "subtracted", "multiplied", "divided"
        ]:
            counter += 1

    print("exp 2 -> ", exp)
    # NOTE(review): eval on constructed text -- only safe because input
    # comes from the local POS tagger, never from untrusted users.
    return str(eval(exp))
コード例 #35
0
ファイル: ruleLinking.py プロジェクト: bethard/timenorm
def process_doc(doc):
    """Rule-based entity linking for one document.

    Reads each annotation XML file for *doc*, infers links between nearby
    time entities, fills in their schema-driven properties, and writes the
    updated XML to out_path.

    Relies on module globals: path, rawpath, dctpath, out_path, tnschema,
    types, dprs, etree, text2num, get_relation.
    """
    for xmlfile in os.listdir(path + '/' + doc):
        axml = etree.parse(path + '/' + doc + '/' + xmlfile)
        # Raw document text, used later to recover entity surface spans.
        rawfile = open(os.path.join(rawpath,  doc ), 'r')
        text = rawfile.read()
        rawfile.close()

        # Document creation time; its weekday disambiguates "Last <weekday>".
        dctfile = open(os.path.join(dctpath, doc, doc + ".dct"), 'r')
        dct = dctfile.read().rstrip()
        dctfile.close()
        try:
            dct = dprs.parse(dct)
            dctDayofWeek = dct.strftime('%A')
        except ValueError:
            dctDayofWeek = ""

        # Pass 1: index entities by id and start offset; ensure each one
        # has a parentsType and an emptied <properties> element.
        entities = dict()
        starts = dict()
        for entity in axml.findall('.//entity'):
            eid = entity.find('./id').text
            estart, eend = map(int, entity.find('./span').text.split(','))
            etype = entity.find('./type').text
            eparentsType = entity.find('./parentsType')
            if eparentsType is not None:
                eparentsType = eparentsType.text
            else:
                # Fall back to the schema default and record it in the XML.
                eparentsType = tnschema[etype]["parentsType"]
                parentsType = etree.Element("parentsType")
                parentsType.text = eparentsType
                entity.append(parentsType)
            eproperties = entity.find('./properties')
            # Empty all links
            if eproperties is not None:
                for prop in eproperties.findall('./*'):
                    eproperties.remove(prop)
            else:
                prop = etree.Element("properties")
                entity.append(prop)
            if estart not in starts:
                starts[estart] = list()
            ent_values = (eid, estart, eend, etype, eparentsType)
            starts[estart].append(eid)
            entities[eid] = ent_values

        # Pass 2: sweep entities left to right, linking each one to
        # schema-compatible earlier entities within a 10-character gap.
        links = dict()
        stack = list()
        entity_list = dict()
        lend = -1
        for start in sorted(starts):
            for entity in starts[start]:
                (eid, estart, eend, etype, eparentsType) = entities[entity]
                # A gap of more than 10 characters starts a fresh window.
                if estart - lend > 10 and lend > -1:
                    stack = list()
                    entity_list = dict()
                lend = eend
                entity_list[eid] = (estart, eend, etype, eparentsType)
                ltype = ""
                stack_pointer = list()
                stack_pointer.extend(stack)
                while len(stack_pointer) > 0:
                    s = stack_pointer.pop()
                    stype = entity_list[s][2]
                    ltype = get_relation(tnschema, etype, stype)
                    if ltype != '':
                        if eid not in links:
                            links[eid] = dict()
                        if ltype not in links[eid]:
                            links[eid][ltype] = list()
                            # NOTE(review): this append sits inside the
                            # "ltype not in links[eid]" guard, so only the
                            # FIRST candidate per relation type is kept --
                            # confirm whether later candidates should also
                            # be appended.
                            links[eid][ltype].append(s)
                    else:
                        ltype = get_relation(tnschema, stype, etype)
                        if ltype != '':
                            if s not in links:
                                links[s] = dict()
                            if ltype not in links[s]:
                                links[s][ltype] = list()
                                # NOTE(review): same single-append pattern
                                # as above.
                                links[s][ltype].append(eid)
                stack.append(eid)


        # Pass 3: serialize inferred links and schema-driven properties
        # back into each entity's <properties> element.
        for entity in axml.findall('.//entity'):
            eid = entity.find('./id').text
            etype = entity.find('./type').text
            estart, eend = map(int, entity.find('./span').text.split(','))
            eproperties = entity.find('./properties')
            if etype in tnschema:
                for relation in tnschema[etype]:
                    if relation != "parentsType":
                        span = "".join(text[estart:eend])
                        if relation == "Type":
                            # Map the surface span to a normalized type name.
                            ptype = span.title()
                            if ptype == "About":
                                ptype = "Approx"
                            if etype in types:
                                if span in types[etype]:
                                    ptype = types[etype][span]
                            # Calendar-Interval types are singular; Period
                            # types are plural.
                            if etype == "Calendar-Interval" and ptype != "Unknown":
                                if ptype.endswith("s"):
                                    ptype = ptype[:-1]
                            elif etype == "Period" and ptype != "Unknown":
                                if not ptype.endswith("s"):
                                    ptype += "s"
                            ty = etree.Element(relation)
                            ty.text = ptype
                            eproperties.append(ty)
                        elif relation == "Value":
                            val = etree.Element(relation)
                            # Strip leading zeros ("07" -> "7") before parsing.
                            span = re.sub(r'^0(\d)', r'\1', re.sub(r'^0+', '0', span))
                            span = str(text2num.text2num(span))
                            val.text = span
                            eproperties.append(val)
                        elif re.search('Interval-Type',relation):
                            # "Link" when an Interval link was found above,
                            # otherwise anchor to the document time.
                            intervalemtpy = True
                            if eid in links:
                                if "Interval" in links[eid]:
                                    if links[eid]["Interval"] != "":
                                        intervalemtpy = False
                            if not intervalemtpy:
                                itype = etree.Element(relation)
                                itype.text = "Link"
                                eproperties.append(itype)
                            else:
                                itype = etree.Element(relation)
                                itype.text = "DocTime"
                                eproperties.append(itype)
                        elif relation == "Semantics":
                            sem = etree.Element(relation)
                            sem.text = "Interval-Not-Included"
                            eproperties.append(sem)
                        else:
                            # Generic link relation: copy over linked ids,
                            # or emit an empty required element.
                            notnull = False
                            if eid in links:
                                if relation in links[eid]:
                                    for child in links[eid][relation]:
                                        si = etree.Element(relation)
                                        si.text = child
                                        eproperties.append(si)
                                        notnull = True
                            if tnschema[etype][relation][0] and not notnull:
                                if eproperties.find('./' + relation) is None:
                                    si = etree.Element(relation)
                                    eproperties.append(si)
                # Special case: "Last <weekday>" includes the interval when
                # the weekday matches the document creation time's weekday.
                if etype == "Last":
                    semantics = eproperties.findall('./Semantics')[0]
                    interval_included = "Interval-Not-Included"
                    for repint in eproperties.findall('./Repeating-Interval'):
                        if repint.text is not None:
                            (rid, rstart, rend, rtype, rparentsType) = entities[repint.text]
                            rspan = "".join(text[int(rstart):int(rend)])
                            if rspan.title() == dctDayofWeek:
                                interval_included = "Interval-Included"
                    semantics.text = interval_included

        if not os.path.exists(out_path + '/' + doc):
            os.makedirs(out_path + '/' + doc)
        axml.write(out_path + '/' + doc + '/' + xmlfile, pretty_print=True)
コード例 #36
0
def nums(x):
    """Best-effort conversion of *x* to a number via text2num.

    Returns the parsed number, or *x* unchanged when it cannot be parsed.
    """
    try:
        result = text2num.text2num(x)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; parse failures still fall through.
        result = x
    return result
コード例 #37
0
            all_ents, players, teams, cities, total_players, total_teams, total_cities = get_ents(entry)
            box_score = entry["box_score"]
            player_name_map = {y: x for x, y in box_score['PLAYER_NAME'].items()}
            home_line_score = entry["home_line"]
            vis_line_score = entry["vis_line"]
            summary = entry['summary']
            instance_count += 1

        else:
            curr.append(line.strip())
            args = line.split("|")
            name = args[0]
            record_type = args[2].strip()
            value = args[1]
            if not value.isdigit():
                value = text2num(value)
            else:
                value = int(value)
            if record_type.startswith("PLAYER-"):
                record_type = record_type[len("PLAYER-"):]

            name = name.replace("UNK", "").strip()
            if name == 'Los Angeles' and 'LA' in total_cities:
                name = 'LA'
            if name in total_players:
                pass
            elif name in total_teams:
                pass
            elif name in players:
                name = resolve_name(name, total_players)
            elif name == 'Los Angeles Clippers' and 'LA Clippers' in total_teams:
コード例 #38
0
def remUnits(text):
	"""Extract a numeric count from a free-form quantity string.

	Returns an int/float total, 0, or the string 'N/A' when nothing
	usable can be parsed.  The heuristics below deliberately mirror the
	messy source data (ranges, 'X million', 'a = b' forms, etc.).
	"""
	if(text == ' ' or text.lower() == 'unknown' or text.lower() == 'to be determined'):
		return 'N/A'
	if('million' in text.lower()):
		# Lower-case the pieces before indexing so 'Million'/'MILLION' are
		# found too (the guard above is case-insensitive, but
		# list.index('million') used to raise ValueError on mixed case).
		pieces = text.replace('~', '').lower().split()
		idx = pieces.index('million') - 1
		million_mult = 1000000
		try:
			return int(pieces[idx]) * million_mult
		except ValueError:
			return int(float(pieces[idx]) * million_mult)
	if('=' in text.lower() or 'version' in text.lower() or 'total' in text.lower() or '(' in text.lower() or '/' in text.lower() or ':' in text.lower() or 'of' in text.lower() or 'each' in text.lower() or 'per' in text.lower() or 'in' in text.lower() or '--' in text.lower()):
		temt = text.replace(',', '').replace('-', '').replace('/', ' / ')#text.replace('(', '').replace(')', '').replace('/', ' / ').replace(',', '')
		pieces = temt.split()
		# Collect every int/float fragment in order of appearance.
		nams = []
		for guess in pieces:
			try:
				frag = int(guess)
				nams.append(frag)
				continue
			except ValueError:
				pass
			try:
				frag = float(guess)
				nams.append(frag)
				continue
			except ValueError:
				continue
		if(nams != []):
			if('=' in text.lower()):
				return nams[len(nams)-1]
			if('version' in text.lower()):
				return nams[0]
			if('/' in text.lower() or 'of' in text.lower() or 'each' in text.lower() or 'per' in text.lower()):
				# "N of M" style: multiply, fall back to the first number.
				try:
					return nams[0]*nams[1]
				except IndexError:
					return nams[0]
			elif('total' in text.lower() or '--' in text.lower() or '(' in text.lower()):
				return nams[0]
			elif(':' in text.lower() or 'in' in text.lower()):
				return sum(nams)
	# Strip parenthesized asides, then scan character by character,
	# grouping digit runs separated by single spaces.
	text = re.sub(r'\([^)]*\)', '', text).replace(', ', ',')
	runningString = ''
	in_number = True
	lastChar = ''
	for x in list(text):
		if(lastChar == 'Z'):
			# NOTE(review): appears to stop at a 'Z-' sequence -- TODO
			# confirm what source values this guard targets.
			if(x == '-'):
				break
		if x.isdigit() or x == ',' or x == '.':
			runningString += x
			in_number = True
		else:
			if in_number:
				runningString += ' '
				in_number = False
		lastChar = x
	#split by spaces
	pieces = runningString.split(' ')
	tot = 0
	#look through each piece
	check = False
	for fragment in pieces:
		#first attempt is if in format of 'US = 500'
		if('=' in fragment):
			try:
				fragment = fragment.split('=')[1]
				grab = int(fragment)
				tot=tot+grab
				check = True
				continue
			except ValueError:
				pass
		if('.' in fragment):
			try:
				tot = tot+float(fragment)
				check = True
				continue
			except ValueError:
				pass
		if(',' in fragment):
			try:
				tot = tot+int(''.join(fragment.split(',')))
				check = True
				continue
			except ValueError:
				pass
		try:
			tot = tot+int(fragment)
			check = True
		except ValueError:
			continue
	if(check == True):
		return tot
	# No digits found: fall back to summing spelled-out number words.
	flag = False
	if(tot == 0):
		pieces2 = text.lower()
		pieces2 = pieces2.split(' ')
		for fragment in pieces2:
			# NOTE(review): assumes this text2num raises ValueError on
			# non-numbers -- confirm against the imported implementation.
			try:
				tot = tot+text2num(fragment)
				flag = True
			except ValueError:
				pass
		if(flag and tot == 0):
			return 0
		if(tot != 0):
			return tot
		if(tot == 0):
			return 'N/A'
	return tot
def square(x):
    """Return x**2 as a float.

    Accepts numbers, numeric strings, or spelled-out number words
    (parsed via t2n.text2num).
    """
    try:
        return float(x)**2
    except (TypeError, ValueError):
        # Not directly numeric -- parse spelled-out words like "three".
        # (Narrowed from a bare `except:`.)
        return float(t2n.text2num(x))**2
import csv
import json
from text2num import text2num
import StringIO

if __name__ == '__main__':
    # Load the raw FY14 police budget CSV into memory.
    with open('rawdata/police_budget_fy14.csv') as csv_file:
        raw_csv = csv_file.read()

    records = []
    for row_name, row_total in csv.reader(StringIO.StringIO(raw_csv)):
        try:
            # Strip the (frequently misspelled) "precinct" label, leaving
            # just the spelled-out number, then convert it to an integer id.
            cleaned = (row_name.lower()
                       .replace("precinct", "")
                       .replace("precint", "")
                       .replace("precinc", "")
                       .strip())
            precinct_no = text2num(cleaned)
            # Spread the annual total evenly across the twelve months.
            monthly_total = int(row_total) / 12
            for month in range(1, 13):
                records.append({
                    "month": month,
                    "precinct": precinct_no,
                    "total": monthly_total,
                    "type": "budget",
                    "year": 2014
                })
        except:
            # TODO: for now, skip everything else, we are skipping about 7
            pass

    print(json.dumps(records))
コード例 #41
0
def takeorderfunction():
    """Interactively take a food order over the microphone.

    Speaks prompts through `engine` (TTS) and listens with `recog`
    (speech_recognition).  Returns the tuple (finalod, mess, name) on
    success, or a single error-message string when the recognition
    service is unavailable.
    """

    finalod = []
    print(
        "INSTRUCTIONS: \n 1) Be clear \n 2) Mention Quantity, even for suborders \n 3) Avoid Repeating name for suborder"
    )

    # Keep listening until the customer's name is recognized.
    engine.say("hi. whats your name?")
    engine.runAndWait()
    while True:
        with sr.Microphone(
        ) as source:  # use the default microphone as the audio source
            name = recog.listen(source)  #recognise name
        try:
            name = str(recog.recognize_google(name))
            break
        except sr.UnknownValueError:
            engine.say("Oops! Didn't catch that")
            engine.runAndWait()
            continue
        except sr.RequestError as e:
            mess = "Sorry Service is Unavailible at the moment"
            return mess

    # Keep listening until the initial order is recognized.
    engine.say("what would you like to eat?")
    engine.runAndWait()
    while True:
        with sr.Microphone(
        ) as source:  # use the default microphone as the audio source
            order = recog.listen(source)  # recognise order
        try:
            order = recog.recognize_google(order)
            print("You said " +
                  order)  # recognize speech using Google Speech Recognition'''
            #order=input()                  #remove after testing
            break
        except sr.UnknownValueError:
            engine.say("Oops! Didn't catch that")
            engine.runAndWait()
            continue
        except sr.RequestError as e:
            mess = "Sorry Service is Unavailible at the moment"
            return mess

    # Normalize the order text: join known multi-word menu items with
    # underscores, map the article "a" to quantity 1, and convert
    # spelled-out numbers to digits.
    order = order.lower()
    for word in makelist.spacethings:
        order = order.replace(word, word.replace(" ", "_"))
    # NOTE(review): the replace is applied twice, presumably to catch
    # overlapping " a a " sequences -- confirm this is intentional.
    order = order.replace(" a ", " 1 ").replace(" a ", " 1 ")
    order = order.split()
    for k, v in enumerate(order):
        order[k] = str(text2num.text2num(str(v).lower()))
    #print(order)          #remove after testing
    mess = "You said " + " ".join(order)
    #print(finalod)
    finalod.extend(makelist.makeorder(order))
    print("before anyhting more", finalod)

    # Loop asking for additional items until the customer says "no".
    while True:
        engine.say("Anything more?")
        engine.runAndWait()

        # Listen until a clear yes/no confirmation is heard.
        while True:
            with sr.Microphone(
            ) as source:  # use the default microphone as the audio source
                add = recog.listen(source)  # recognise confirmtion
            try:
                add = str(recog.recognize_google(add))
                if add == "yes" or add == "no":
                    break
                else:
                    engine.say("please answer with yes or no. Anything more?")
                    engine.runAndWait()
                    continue
            except sr.UnknownValueError:
                engine.say("Oops! Didn't catch that")
                engine.runAndWait()
                continue
            except sr.RequestError as e:
                mess = "Sorry Service is Unavailable at the moment"
                return mess

        if add == "yes":
            engine.say("what would you like to add to your order")
            engine.runAndWait()
            while True:
                with sr.Microphone(
                ) as source:  # use the default microphone as the audio source
                    aorder = recog.listen(source)  # recognise order
                try:
                    aorder = str(recog.recognize_google(aorder))
                    print(
                        "You said " + aorder
                    )  # recognize speech using Google Speech Recognition'''
                    # order=input()                  #remove after testing
                    break
                except sr.UnknownValueError:
                    engine.say("Oops! Didn't catch that. Please try again.")
                    engine.runAndWait()
                    continue
                except sr.RequestError as e:
                    mess = "Sorry Service is Unavailable at the moment"
                    return mess
            # Same normalization as the initial order above.
            aorder = aorder.lower()
            for word in makelist.spacethings:
                aorder = aorder.replace(word, word.replace(" ", "_"))
            aorder = aorder.replace(" a ", " 1 ").replace(" a ", " 1 ")
            aorder = aorder.split()
            for k, v in enumerate(aorder):
                aorder[k] = str(text2num.text2num(str(v).lower()))
            # print(order)          #remove after testing
            mess = mess + " + " + " ".join(aorder)
            finalod.extend(makelist.makeorder(aorder))
        if add == "no":
            break

    return finalod, mess, name
コード例 #42
0
ファイル: atoi.py プロジェクト: stensonowen/factorial_bot
def atoi(text):
    """Parse free-form text into a number: preprocess with prep(), then
    hand the tokens to text2num."""
    return text2num.text2num(prep(text))
コード例 #43
0
ファイル: clean.py プロジェクト: phymucs/rw_fg
def fix_tokenization(s):
    """Normalize tokenization artifacts in summary text *s*.

    Collapses known multi-word expressions into underscore-joined tokens,
    splits possessives and punctuation glued onto words, and converts
    spelled-out numbers in pattern-p6 matches to digits.  Relies on the
    module-level compiled patterns p1..p7 and the post_fixes map.
    """
    global full_name_cnt
    # NOTE(review): hardcoded absolute path -- only works on the original
    # author's machine; consider making it configurable.
    mwe_file = "/home/hongmin_wang/table2text_nlg/harvardnlp/data2text-harvard/mwes.json"
    with io.open(mwe_file, 'r', encoding='utf-8') as fmwe:
        tmp = json.load(fmwe)
        mwes = {k: v for k, v in tmp.items() if v > 1}
    full_names = {' '.join(k.split('_')): k for k, _ in mwes.items()}

    clean = []

    # Replace each known full name with its underscore-joined token.
    for k, v in full_names.items():
        if k in s:
            full_name_cnt += 1
            s = s.replace(k, v)

    for w in s.split():
        # Split possessive endings onto their own tokens.
        if w.endswith("s’"):
            w = ' '.join([w[:-1], "'"])
        elif w.endswith("’s"):
            w = ' '.join([w[:-2], "'s"])

        if re.search(p1, w):
            components = w.split('.')
            if len(components) == 2:
                print("Original {}".format(w))
                w = ' . '.join(components)
                print("changed to {}".format(w))
            if w.endswith('..'):
                print("Original {}".format(w))
                w = '{} .'.format(components[0])
                print("changed to {}".format(w))

        if re.search(p2, w):
            print("Original {}".format(w))
            num, suffix = re.findall(p2, w)[0]
            w = ' '.join([num, suffix])
            print("changed to {}".format(w))

        # fix tokenization errors caused by commas
        if re.search(p3, w):
            print("Original {}".format(w))
            w = ''.join(w.split(','))
            print("changed to {}".format(w))

        if re.search(p4, w):
            print("Original {}".format(w))
            w = ' , '.join(w.split(','))
            print("changed to {}".format(w))

        if re.search(p5, w):
            print("Original {}".format(w))
            w = ' - '.join(w.split('-'))
            print("changed to {}".format(w))

        if re.search(p6, w):
            print("Original {}".format(w))
            # BUG FIX: re.findall with multiple groups yields TUPLES; the
            # old code assigned into the tuple, which always raised
            # TypeError and was silently swallowed by the bare except, so
            # the spelled-number conversion never actually ran.  Convert
            # to a list and stringify the parsed number so join() works.
            pieces = list(re.findall(p6, w)[0])
            try:
                pieces[0] = str(text2num(pieces[0]))
            except:
                # best-effort: leave the word unchanged when it is not a
                # parseable number
                pass
            w = ' '.join(pieces)
            print("changed to {}".format(w))

        if re.search(p7, w):
            pre = re.findall(p7, w)[0]
            print("Original {}".format(w))
            w = ' '.join([pre, 'two_point'])
            print("changed to {}".format(w))

        clean.append(w.strip())

    result = ' '.join(clean)
    for k, v in post_fixes.items():
        result = result.replace(k, v)

    return result
コード例 #44
0
ファイル: help2vec.py プロジェクト: replive/nightfury
def input_help_to_vec(p):
    """Turn free-text input-field help (e.g. password rules) into
    requirement vectors of the form {"length": n, "chars": [...]}.

    Parses *p* with pattern's parsetree and walks ADJP/NP chunks.
    Relies on module-level helpers: Operator, operate, INPUT_VECTOR and
    the *_similarity functions.  Uses string.lowercase/string.uppercase,
    i.e. this is Python 2 code.
    """
    t = parsetree(p)
    requirements = []
    mandatory = False
    # pprint(t)
    for sen in t:
        for i, chunk in enumerate(sen.chunks):
            if chunk.type == "ADJP":
                # NOTE(review): `vector` is never used in this ADJP branch;
                # only the `mandatory` flag matters here.
                vector = copy.deepcopy(INPUT_VECTOR)
                for w in chunk.words:
                    if w.type.startswith("JJ") and mandatory_similarity(w.string) > 0.9:
                        mandatory = True
            if chunk.type == "NP":
                vector = copy.deepcopy(INPUT_VECTOR)
                adjv_nn_bridge = []
                op = Operator()  # 0 = and & 1 = or
                ignore = False  # Useful when have DT like no etc..
                for w in chunk.words:
                    if w.type == "CD":
                        # Cardinal number -> required length (digits first,
                        # then spelled-out words via text2num).
                        try:
                            op.get()
                            vector["length"] = int(w.string)
                        except ValueError:
                            try:
                                vector["length"] = text2num.text2num(w.string)
                            except text2num.NumberException:
                                pass
                    elif w.type == "CC":
                        # Conjunction switches the pending and/or operator.
                        ignore = False
                        if w.string.lower() == "and":
                            op.set(0)
                        elif w.string.lower() == "or":
                            op.set(1)
                    elif w.type.startswith("NN"):
                        # Nouns name a char class: letters / capitals / digits.
                        similarities = [alphabet_similarity(w.string), capital_similarity(w.string), number_similarity(w.string)]
                        m = max(similarities)
                        m_index = similarities.index(m)
                        if m > 0.9 and not ignore:
                            if m_index == 0:
                                if len(adjv_nn_bridge) == 0: adjv_nn_bridge.append(random.choice(list(string.lowercase)))
                                vector["chars"] = operate(vector["chars"], adjv_nn_bridge, op)
                            elif m_index == 1:
                                vector["chars"] = operate(vector["chars"], [random.choice(list(string.uppercase))], op)
                            elif m_index == 2:
                                vector["chars"] = operate(vector["chars"], [random.choice([str(i) for i in range(0, 10)])], op)
                    elif w.type.startswith("JJ"):
                        # Adjectives qualify the upcoming noun
                        # (lowercase / uppercase / special characters).
                        similarities = [lowercase_similarity(w.string), uppercase_similarity(w.string), special_similarity(w.string)]
                        m = max(similarities)
                        m_index = similarities.index(m)
                        if m > 0.9 and not ignore:
                            if m_index == 0:
                                adjv_nn_bridge = operate(adjv_nn_bridge, [random.choice(list(string.lowercase))], op)
                            elif m_index == 1:
                                adjv_nn_bridge = operate(adjv_nn_bridge, [random.choice(list(string.uppercase))], op)
                            elif m_index == 2:
                                adjv_nn_bridge = operate(adjv_nn_bridge, [random.choice(['!', '$'])], op)
                                if vector["length"] == 0: vector["length"] = 1
                        else:
                            op.get()  # If there is a CC it gets cleaned because we couldn't identify the adjective
                    elif w.type.startswith("DT"):
                        if w.string.lower().startswith("no"):
                            ignore = True

                requirements.append(vector)


    if mandatory and len(requirements) == 0: requirements.append({"length": 1, "chars": ['x']})
    # Handling conjunctions at sentence level
    # Merging vectors based on 'and' and 'or' as of now
    l = []
    last_chunk = None
    for w in t.words:
        if w.chunk == None and w.type.startswith("CC"):
            if w.string.lower() == "or":
                l.append(1)
        elif w.chunk and w.chunk.type == "NP":
            if last_chunk == None or (last_chunk != w.chunk):
                l.append(requirements.pop(0))
                last_chunk = w.chunk

    final = []
    i = 0
    while i < len(l):
        if l[i] == 1:
            # NOTE(review): an 'or' marker causes the NEXT vector to be
            # skipped entirely rather than merged as an alternative --
            # confirm this is the intended handling.
            i += 2
        else:
            if l[i]["length"] != 0 and len(l[i]["chars"]) > 0: final.append(l[i])
            i += 1
    return(final)