Exemple #1
0
def remove_stopwords_id(kalimat):
    """Return *kalimat* with stopwords removed and tokens joined by spaces.

    The stopword list is the library's built-in list (presumably the
    Indonesian one, going by the function name -- confirm against the
    imports) extended with a few extra words.  The cleaned text is then
    tokenized with nltk and re-joined with single spaces.
    """
    # Built-in stopwords plus the extra words to drop.
    stopwords = StopWordRemoverFactory().get_stop_words() + [
        'daring', 'online', 'nih'
    ]
    remover = StopWordRemover(ArrayDictionary(stopwords))

    cleaned = remover.remove(kalimat)
    words = nltk.tokenize.word_tokenize(cleaned)
    return " ".join(words)
Exemple #2
0
# Two-character operators that the splitter emits as a single piece.
_TWO_CHAR_OPERATORS = frozenset((
    '->', '<<', '>>', '&&', '||', '|=', '==', '!=', '++', '--', '+=', '-=',
))


def _split_code_line(line):
    """Split one space-joined code line into raw pieces for CreateVariable.

    Whitespace ends the current piece, the two-character operators in
    _TWO_CHAR_OPERATORS and any other punctuation become pieces of their
    own, and a single- or double-quoted literal is kept as one piece
    including its quotes.

    NOTE(review): text accumulated since the last separator is not flushed
    when a punctuation character is met, and an unterminated literal is
    dropped.  Both are kept as-is; they do not matter when every source
    token is already separated by a space, which is how mapping() builds
    its lines.
    """
    pieces = []
    start = 0       # index where the pending piece begins
    pos = 0
    quote = None    # the opening quote character while inside a literal
    literal = ''
    size = len(line)

    while pos < size:
        ch = line[pos]

        if quote is not None:
            # Inside a literal: copy everything up to the closing quote.
            literal += ch
            if ch == quote:
                pieces.append(literal)
                literal = ''
                quote = None
                start = pos + 1
            pos += 1
        elif isphor(ch, space):
            if pos > 0:
                pieces.append(line[start:pos])
            start = pos + 1
            pos += 1
        elif pos + 1 == size:
            # Last character: flush whatever is pending together with it.
            pieces.append(line[start:pos + 1])
            break
        elif isphor(ch, phla):
            # Not the last character, so this slice always has length 2.
            pair = line[pos:pos + 2]
            if pair in _TWO_CHAR_OPERATORS:
                pieces.append(pair)
                start = pos + 2
                pos += 2
            elif ch == '"' or ch == '\'':
                literal += ch
                quote = ch
                pos += 1
            else:
                pieces.append(ch)
                start = pos + 1
                pos += 1
        else:
            pos += 1

    # Drop the empty pieces produced by consecutive separators.
    for _ in range(pieces.count(spa)):
        pieces.remove('')
    return pieces


def _alias_for(name, aliases, prefix):
    """Return the alias of *name*, allocating ``prefix + <n>`` on first use.

    Aliases are numbered in order of first appearance starting at 0, so
    the next free number is always the current size of the table.
    """
    alias = aliases.get(name)
    if alias is None:
        alias = prefix + str(len(aliases))
        aliases[name] = alias
    return alias


def _rename_identifiers(token, func_aliases, var_aliases, list_func):
    """Replace user-defined function and variable names in *token* in place.

    Names of user-defined functions that contain "good" or "bad" are also
    appended to *list_func* (once per occurrence).
    """
    total = len(token)  # items are only reassigned, the length never changes
    j = 0
    while j < total:
        name = token[j]

        # Constants and tokens that are not identifier-shaped are kept.
        if name in constValue or not isphor(name, variable):
            j += 1
            continue

        # Keywords and type words are kept.
        if (name in keywords_0 or name in typewords_0
                or name in typewords_1 or name in typewords_2):
            j += 1
            continue

        # Last token of the line: alias it as a variable and stop.  Stopping
        # unconditionally matters: re-examining the slot would alias the
        # alias itself.
        if j + 1 == total:
            token[j] = _alias_for(name, var_aliases, 'variable_')
            break

        following = token[j + 1]

        # "new <name> [": the name is kept.
        if j >= 1 and token[j - 1] == 'new' and following == '[':
            j += 2
            continue

        if following == '(':
            # Names found in the keyword tables / wildcard checks are kept;
            # everything else is treated as a user-defined function.
            keep_name = (name in keywords_1 or name in keywords_2
                         or isinKeyword_3(name) or name in keywords_4
                         or isinKeyword_5(name))
            if not keep_name:
                if "good" in name or "bad" in name:
                    list_func.append(str(name))
                token[j] = _alias_for(name, func_aliases, 'func_')
            # Always step over the '(' -- also when the alias already
            # existed, so every call of one function gets the same alias.
            j += 2
            continue

        if isphor(following, variable):
            # Followed by another identifier-shaped token: kept.
            j += 1
            continue

        if following == '*':
            # "<name> * const" and "const <name> *": the name is kept.
            if j + 2 < total and token[j + 2] == 'const':
                j += 3
                continue
            if j >= 1 and token[j - 1] == 'const':
                j += 2
                continue
            # "<name> * )" is kept unless the name directly follows an
            # operator (then it is an operand of an arithmetic expression).
            follows_operator = j - 1 > 0 and token[j - 1] in operators
            if (not follows_operator and j + 2 < total
                    and token[j + 2] == ')'):
                j += 2
                continue

        token[j] = _alias_for(name, var_aliases, 'variable_')
        j += 2


def mapping(list_sentence):
    """Replace user-defined identifiers in code lines by symbolic names.

    Args:
        list_sentence: iterable of code lines, each an iterable of strings
            that is joined with single spaces before processing.

    Returns:
        A ``(list_code, list_func)`` tuple.  ``list_code`` holds one
        space-joined string per input line in which user-defined functions
        appear as ``func_<n>`` and variables as ``variable_<n>``; the
        numbering is shared by all lines of this call and follows the order
        of first appearance.  ``list_func`` holds the original names of
        user-defined functions containing "good" or "bad".
    """
    list_code = [' '.join(code) for code in list_sentence]
    list_func = []
    func_aliases = {}
    var_aliases = {}

    for index, line in enumerate(list_code):
        token = []
        CreateVariable(_split_code_line(line), token)
        _rename_identifiers(token, func_aliases, var_aliases, list_func)
        list_code[index] = ' '.join(token)

    return list_code, list_func
Exemple #3
0
def nlp(text):
    """Generate fill-in-the-blank questions from the key phrases of *text*.

    Key phrases are scored with PageRank over a graph of candidate words
    (adjectives and nouns that are not stop words).  For every key phrase,
    in descending score order, the phrase is printed and the first
    remaining '.'-separated sentence containing it is consumed; each
    occurrence of " <phrase>" in that sentence is replaced by " ____".
    The resulting ``(question, answer)`` tuple is appended to the
    module-level ``arrQuestions`` list.

    Works on both Python 2.7 and Python 3.  Returns None.
    """
    import itertools
    import string

    import networkx
    import nltk

    def extract_candidate_words(text, good_tags=frozenset([
            'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
        """Return the lowercased words of *text* whose POS tag is in *good_tags*.

        Stop words and tokens made only of punctuation are excluded.
        """
        punct = set(string.punctuation)
        stop_words = set(nltk.corpus.stopwords.words('english'))
        tagged_words = itertools.chain.from_iterable(
            nltk.pos_tag_sents(nltk.word_tokenize(sent)
                               for sent in nltk.sent_tokenize(text)))
        return [word.lower() for word, tag in tagged_words
                if tag in good_tags
                and word.lower() not in stop_words
                and not all(char in punct for char in word)]

    def score_keyphrases_by_textrank(text, n_keywords=0.2):
        """Return ``(keyphrase, score)`` pairs sorted by descending score.

        *n_keywords* strictly between 0 and 1 is taken as a fraction of
        the number of candidate words; otherwise it is used as a count.
        """
        words = [word.lower()
                 for sent in nltk.sent_tokenize(text)
                 for word in nltk.word_tokenize(sent)]
        candidates = extract_candidate_words(text)

        # One node per unique candidate, one unweighted edge per pair of
        # consecutive candidates.
        graph = networkx.Graph()
        graph.add_nodes_from(set(candidates))

        def pairwise(iterable):
            """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
            a, b = itertools.tee(iterable)
            next(b, None)
            return zip(a, b)

        for w1, w2 in pairwise(candidates):
            graph.add_edge(*sorted([w1, w2]))

        # Score nodes with the default PageRank and keep the top n_keywords.
        ranks = networkx.pagerank(graph)
        if 0 < n_keywords < 1:
            n_keywords = int(round(len(candidates) * n_keywords))
        top_ranked = sorted(ranks.items(), key=lambda item: item[1],
                            reverse=True)[:n_keywords]
        word_ranks = dict(top_ranked)

        # Merge runs of adjacent keywords (at most 10 words) into phrases.
        keyphrases = {}
        next_free = 0  # keeps merged key phrases from overlapping
        for i, word in enumerate(words):
            if i < next_free or word not in word_ranks:
                continue
            kp_words = list(itertools.takewhile(
                lambda w: w in word_ranks, words[i:i + 10]))
            avg_pagerank = (sum(word_ranks[w] for w in kp_words)
                            / float(len(kp_words)))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            next_free = i + len(kp_words)

        return sorted(keyphrases.items(), key=lambda item: item[1],
                      reverse=True)

    sentences = text.split('.')
    for phrase, _rank in score_keyphrases_by_textrank(text):
        print(phrase)

        word_of_interest = phrase.lower()

        # Consume the first remaining sentence that mentions the phrase so
        # that no sentence is used for two questions.
        quiz_sentence = ""
        for position, sentence in enumerate(sentences):
            if word_of_interest in sentence.lower():
                quiz_sentence = sentence
                del sentences[position]
                break

        # NOTE(review): only occurrences preceded by a space are blanked, so
        # a phrase at the very start of the sentence stays visible in the
        # question -- confirm whether that is intended.
        question_blanked = quiz_sentence.lower().replace(
            " " + word_of_interest, " ____").strip()
        if question_blanked:
            question_blanked = (question_blanked[0].upper()
                                + question_blanked[1:])
            arrQuestions.append((question_blanked, word_of_interest))
Exemple #4
0
# Two-character operators that the splitter emits as a single piece.
_TWO_CHAR_OPERATORS = frozenset((
    '->', '<<', '>>', '&&', '||', '|=', '==', '!=', '++', '--', '+=', '-=',
))


def _split_code_line(line):
    """Split one space-joined code line into raw pieces for CreateVariable.

    Whitespace ends the current piece, the two-character operators in
    _TWO_CHAR_OPERATORS and any other punctuation become pieces of their
    own, and a single- or double-quoted literal is kept as one piece
    including its quotes.

    NOTE(review): text accumulated since the last separator is not flushed
    when a punctuation character is met, and an unterminated literal is
    dropped.  Both are kept as-is; they do not matter when every source
    token is already separated by a space, which is how mapping() builds
    its lines.
    """
    pieces = []
    start = 0       # index where the pending piece begins
    pos = 0
    quote = None    # the opening quote character while inside a literal
    literal = ''
    size = len(line)

    while pos < size:
        ch = line[pos]

        if quote is not None:
            # Inside a string constant: copy up to the closing quote.
            literal += ch
            if ch == quote:
                pieces.append(literal)
                literal = ''
                quote = None
                start = pos + 1
            pos += 1
        elif isphor(ch, space):
            # The character is whitespace.
            if pos > 0:
                pieces.append(line[start:pos])
            start = pos + 1
            pos += 1
        elif pos + 1 == size:
            # Last character: flush whatever is pending together with it.
            pieces.append(line[start:pos + 1])
            break
        elif isphor(ch, phla):
            # The character is not a letter, digit or underscore.
            # Not the last character, so this slice always has length 2.
            pair = line[pos:pos + 2]
            if pair in _TWO_CHAR_OPERATORS:
                pieces.append(pair)
                start = pos + 2
                pos += 2
            elif ch == '"' or ch == '\'':
                # Start of a string constant.
                literal += ch
                quote = ch
                pos += 1
            else:
                pieces.append(ch)
                start = pos + 1
                pos += 1
        else:
            pos += 1

    # Drop the empty pieces produced by consecutive separators.
    for _ in range(pieces.count(spa)):
        pieces.remove('')
    return pieces


def _alias_for(name, aliases, prefix):
    """Return the alias of *name*, allocating ``prefix + <n>`` on first use.

    Aliases are numbered in order of first appearance starting at 0, so
    the next free number is always the current size of the table.
    """
    alias = aliases.get(name)
    if alias is None:
        alias = prefix + str(len(aliases))
        aliases[name] = alias
    return alias


def _rename_identifiers(token, func_aliases, var_aliases, list_func):
    """Replace user-defined function and variable names in *token* in place.

    Names of user-defined functions that contain "good" or "bad" are also
    appended to *list_func* (once per occurrence).
    """
    total = len(token)  # items are only reassigned, the length never changes
    j = 0
    while j < total:
        name = token[j]

        # Constants and tokens that do not look like a variable are kept
        # (this covers numeric and string constants).
        if name in constValue or not isphor(name, variable):
            j += 1
            continue

        # Keywords and type words are kept.
        if (name in keywords_0 or name in typewords_0
                or name in typewords_1 or name in typewords_2):
            j += 1
            continue

        # Last token of the line: alias it as a variable and stop.  Stopping
        # unconditionally matters: re-examining the slot would alias the
        # alias itself.
        if j + 1 == total:
            token[j] = _alias_for(name, var_aliases, 'variable_')
            break

        following = token[j + 1]

        # "new <name> [": the name is kept.
        if j >= 1 and token[j - 1] == 'new' and following == '[':
            j += 2
            continue

        if following == '(':
            # Looks like a call.  Kept as-is, in this order: control
            # structures such as "if" (not expected in PDG code), sensitive
            # functions, names matching the sensitive-function wildcards,
            # C/C++ library functions and APIs, and names matching the
            # malloc-style wildcards.  Anything else is user-defined.
            keep_name = (name in keywords_1 or name in keywords_2
                         or isinKeyword_3(name) or name in keywords_4
                         or isinKeyword_5(name))
            if not keep_name:
                if "good" in name or "bad" in name:
                    list_func.append(str(name))
                token[j] = _alias_for(name, func_aliases, 'func_')
            # Always step over the '(' -- also when the alias already
            # existed, so every call of one function gets the same alias.
            j += 2
            continue

        if isphor(following, variable):
            # Followed by another variable-shaped token: kept.
            j += 1
            continue

        if following == '*':
            # "<name> * const" and "const <name> *": the name is kept.
            if j + 2 < total and token[j + 2] == 'const':
                j += 3
                continue
            if j >= 1 and token[j - 1] == 'const':
                j += 2
                continue
            # "<name> * )" is kept unless the name directly follows an
            # operator (then it is a variable in an arithmetic expression).
            follows_operator = j - 1 > 0 and token[j - 1] in operators
            if (not follows_operator and j + 2 < total
                    and token[j + 2] == ')'):
                j += 2
                continue

        token[j] = _alias_for(name, var_aliases, 'variable_')
        j += 2


def mapping(list_sentence):
    """Replace user-defined identifiers in code lines by symbolic names.

    Args:
        list_sentence: iterable of code lines, each an iterable of strings
            that is joined with single spaces before processing.

    Returns:
        A list with one space-joined string per input line in which
        user-defined functions appear as ``func_<n>`` and variables as
        ``variable_<n>``; the numbering is shared by all lines of this call
        and follows the order of first appearance.
    """
    list_code = [' '.join(code) for code in list_sentence]
    # Original names of user-defined functions containing "good" or "bad";
    # collected but not returned by this variant.
    list_func = []
    func_aliases = {}
    var_aliases = {}

    for index, line in enumerate(list_code):
        token = []
        CreateVariable(_split_code_line(line), token)
        _rename_identifiers(token, func_aliases, var_aliases, list_func)
        list_code[index] = ' '.join(token)

    return list_code
Exemple #5
0
def create_tokens(sentence):
    """Split a line of source text into word and operator tokens.

    The text is scanned one character at a time.  Characters that are
    neither whitespace nor punctuation accumulate into a word; a whitespace
    or punctuation character ends the current word.  Punctuation is emitted
    as tokens of its own: two adjacent punctuation characters accepted by
    ``doubisphor`` are emitted as one two-character token, and a third
    punctuation character accepted by ``trisphor`` extends it to a
    three-character token.

    ``isphor``, ``doubisphor`` and ``trisphor`` are defined elsewhere in
    this file; ``isphor(ch, pattern)`` is presumably a regex test of one
    character against ``pattern`` -- verify against its definition.

    :param sentence: the text to tokenize (indexed and sliced like a string).
    :return: a new list of tokens in order of appearance, with every ``''``,
        ``' '`` and ``'\\r'`` entry removed.
    """
    phla = '[^_a-zA-Z0-9]'  # anything that cannot be part of a word
    space = r'\s'  # raw string: same value as before, no invalid-escape warning
    text = sentence
    tokens = []
    start = 0  # index at which the word currently being accumulated begins
    i = 0

    while i < len(text):
        if isphor(text[i], space):
            # Whitespace ends the pending word (if any) and is itself dropped.
            if i > start:
                tokens.append(text[start:i])
            start = i + 1

        elif isphor(text[i], phla):
            # Flush the pending word first.  This may append '' when two
            # delimiters are adjacent; empty entries are filtered out below.
            tokens.append(text[start:i])

            if i + 1 < len(text) and isphor(text[i + 1], phla):
                if doubisphor(text[i], text[i + 1]):
                    pair = text[i] + text[i + 1]

                    if i + 2 < len(text) and isphor(text[i + 2], phla):
                        if trisphor(pair, text[i + 2]):
                            tokens.append(pair + text[i + 2])
                        else:
                            tokens.append(pair)
                            tokens.append(text[i + 2])
                        # Three characters consumed (one more is skipped by
                        # the shared increment at the bottom of the loop).
                        start = i + 3
                        i = i + 2
                    else:
                        tokens.append(pair)
                        start = i + 2
                        i = i + 1
                else:
                    # Two punctuation characters that do not combine: emit
                    # each one separately.
                    # NOTE(review): text[i + 1] may be whitespace here; only
                    # ' ' and '\r' are filtered afterwards, so a tab or
                    # newline right after punctuation appears to survive as
                    # a token -- confirm whether that is intended.
                    tokens.append(text[i])
                    tokens.append(text[i + 1])
                    start = i + 2
                    i = i + 1
            else:
                tokens.append(text[i])
                start = i + 1

        i = i + 1

    # Flush the final word.  Without this, input that does not end in
    # whitespace or punctuation silently loses its last token.
    if start < len(text):
        tokens.append(text[start:])

    # Drop every placeholder entry in one pass (previously only the first
    # '\r' was removed, and ' ' / '' were removed with repeated list scans).
    return [tok for tok in tokens if tok not in ('', ' ', '\r')]