def remove_stopwords_id(kalimat):
    """Remove stop words from a sentence and return the remaining tokens.

    The stop-word list is the factory's built-in list extended with a few
    extra words ('daring', 'online', 'nih'). The cleaned text is tokenised
    with NLTK and the tokens are joined back with single spaces.

    Args:
        kalimat: the sentence to clean (presumably Indonesian text, going by
            the function name and the extra stop words -- confirm with caller).

    Returns:
        The sentence without stop words, tokens separated by one space.
    """
    # Start from the built-in stop words and append the custom ones.
    extra_words = ['daring', 'online', 'nih']
    stop_words = StopWordRemoverFactory().get_stop_words() + extra_words

    # Build a remover backed by the combined word list.
    remover = StopWordRemover(ArrayDictionary(stop_words))
    cleaned = remover.remove(kalimat)

    return " ".join(nltk.tokenize.word_tokenize(cleaned))
def mapping(list_sentence):
    """Replace user-defined function and variable names with placeholders.

    Every sentence is joined into one space-separated line, split into
    lexical pieces, converted to a token list by ``CreateVariable`` and then
    scanned: identifiers that are not constants, keywords, type words or
    recognised (non-mapped) functions are renamed to ``func_<n>`` when they
    are followed by ``(`` and to ``variable_<n>`` otherwise. Numbers are
    handed out in first-seen order and are shared by all sentences of a
    single call.

    Args:
        list_sentence: iterable of sentences; each sentence is an iterable
            of strings whose items are joined with single spaces.

    Returns:
        A tuple ``(list_code, list_func)``. ``list_code`` holds one rewritten
        line per sentence. ``list_func`` holds the original names of the
        renamed functions whose name contains ``"good"`` or ``"bad"``, one
        entry per occurrence.
    """
    # Two-character operators that must be kept as one lexical piece.
    two_char_operators = frozenset((
        '->', '<<', '>>', '&&', '||', '|=', '==', '!=',
        '++', '--', '+=', '-=',
    ))

    def _placeholder(table, prefix, name):
        """Return the placeholder of *name*, allocating the next one if new."""
        if name not in table:
            # Placeholders are only ever created here, numbered 0, 1, 2, ...,
            # so the next free number always equals the table size.
            table[name] = prefix + str(len(table))
        return table[name]

    def _split_line(line):
        """Cut *line* into words, operators and quoted literals."""
        pieces = []
        start = 0        # index where the pending word begins
        pos = 0
        quote = None     # the opening quote while inside a literal
        literal = ''
        size = len(line)
        while pos < size:
            char = line[pos]
            if quote is not None:
                # Inside a quoted literal: copy characters up to and
                # including the matching closing quote.
                # NOTE(review): an unterminated literal is dropped, exactly
                # as in the previous implementation.
                literal += char
                if char == quote:
                    pieces.append(literal)
                    literal = ''
                    quote = None
                    start = pos + 1
                pos += 1
            elif isphor(char, space):
                # Whitespace ends the pending word (possibly an empty one,
                # which is filtered out below).
                if pos > 0:
                    pieces.append(line[start:pos])
                start = pos + 1
                pos += 1
            elif pos + 1 == size:
                # Last character: flush whatever is still pending.
                pieces.append(line[start:])
                break
            elif isphor(char, phla):
                # Not a letter, digit or underscore. A next character always
                # exists here because the last-character case was handled.
                pair = line[pos:pos + 2]
                if pair in two_char_operators:
                    pieces.append(pair)
                    start = pos + 2
                    pos += 2
                elif char == '"' or char == '\'':
                    literal = char
                    quote = char
                    pos += 1
                else:
                    pieces.append(char)
                    start = pos + 1
                    pos += 1
            else:
                pos += 1
        # Drop empty pieces. Assumes the module-level `spa` is the empty
        # string -- TODO confirm.
        return [piece for piece in pieces if piece != spa]

    list_code = [' '.join(code) for code in list_sentence]
    list_func = []
    _func_dict = {}
    _variable_dict = {}

    for index, line in enumerate(list_code):
        token = []
        CreateVariable(_split_line(line), token)

        count = len(token)  # only elements are replaced, never the length
        j = 0
        while j < count:
            name = token[j]
            if name in constValue:
                j += 1
            elif isphor(name, variable):
                has_next = j + 1 < count
                if (name in keywords_0 or name in typewords_0
                        or name in typewords_1 or name in typewords_2):
                    # Keyword or type word: keep as is.
                    j += 1
                elif (j - 1 >= 0 and has_next and token[j - 1] == 'new'
                        and token[j + 1] == '['):
                    # `new T[...]`: T is kept as is.
                    j += 2
                elif has_next and token[j + 1] == '(':
                    # Identifier followed by '(' -- a call-like construct.
                    if (name in keywords_1 or name in keywords_2
                            or isinKeyword_3(name) or name in keywords_4
                            or isinKeyword_5(name)):
                        # Recognised by one of the keyword tables/matchers:
                        # not renamed.
                        j += 2
                    else:
                        # Anything else is treated as user-defined.
                        if "good" in name or "bad" in name:
                            list_func.append(str(name))
                        token[j] = _placeholder(_func_dict, 'func_', name)
                        j += 2
                elif has_next and not isphor(token[j + 1], variable):
                    # Identifier followed by a non-identifier token.
                    if token[j + 1] == '*':
                        if j + 2 < count and token[j + 2] == 'const':
                            j += 3
                        elif j - 1 >= 0 and token[j - 1] == 'const':
                            j += 2
                        elif j - 1 > 0 and token[j - 1] in operators:
                            # NOTE(review): `> 0` (not `>= 0`) is kept from
                            # the previous implementation; it skips the
                            # operator test when the identifier is token 1.
                            token[j] = _placeholder(
                                _variable_dict, 'variable_', name)
                            j += 2
                        elif j + 2 < count and token[j + 2] == ')':
                            j += 2
                        else:
                            token[j] = _placeholder(
                                _variable_dict, 'variable_', name)
                            j += 2
                    else:
                        token[j] = _placeholder(
                            _variable_dict, 'variable_', name)
                        j += 2
                elif j + 1 == count:
                    # Identifier is the last token of the line.
                    token[j] = _placeholder(_variable_dict, 'variable_', name)
                    break
                else:
                    j += 1
            else:
                # Numbers, string constants and everything else: unchanged.
                j += 1

        list_code[index] = ' '.join(token)

    return list_code, list_func
def nlp(text):
    """Extract key phrases from *text* and queue fill-in-the-blank questions.

    Key phrases are ranked with TextRank (PageRank over a graph of adjacent
    candidate words). For each phrase, best-ranked first, the first remaining
    '.'-delimited sentence that contains it is taken out of the pool,
    lower-cased, and every occurrence of the phrase that follows a space is
    replaced by ``____``. The pair ``(question, phrase)`` is appended to the
    module-level ``arrQuestions``; each phrase is also printed.

    Args:
        text: the English source text.

    Returns:
        None. Results are appended to ``arrQuestions`` (defined outside this
        function; only its ``append`` method is used here).
    """

    def extract_candidate_words(
            text,
            good_tags=frozenset(
                ['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
        """Return lower-cased adjectives/nouns that are not stop words."""
        import itertools
        import nltk
        import string

        # Exclude candidates that are stop words or entirely punctuation.
        punct = set(string.punctuation)
        stop_words = set(nltk.corpus.stopwords.words('english'))
        # Tokenize and POS-tag words.
        tagged_words = itertools.chain.from_iterable(
            nltk.pos_tag_sents(nltk.word_tokenize(sent)
                               for sent in nltk.sent_tokenize(text)))
        # Filter on the wanted POS tags and lowercase all words.
        return [word.lower() for word, tag in tagged_words
                if tag in good_tags
                and word.lower() not in stop_words
                and not all(char in punct for char in word)]

    def score_keyphrases_by_textrank(text, n_keywords=0.2):
        """Return ``(keyphrase, score)`` pairs sorted by descending score."""
        from itertools import takewhile, tee
        import networkx
        import nltk

        # Tokenize for all words, and extract *candidate* words.
        words = [word.lower()
                 for sent in nltk.sent_tokenize(text)
                 for word in nltk.word_tokenize(sent)]
        candidates = extract_candidate_words(text)

        # Build graph, each node is a unique candidate.
        graph = networkx.Graph()
        graph.add_nodes_from(set(candidates))

        def pairwise(iterable):
            """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
            a, b = tee(iterable)
            next(b, None)
            # `zip` replaces the Python-2-only `itertools.izip`.
            return zip(a, b)

        # Iterate over word pairs, add unweighted edges into the graph.
        for w1, w2 in pairwise(candidates):
            if w2:
                graph.add_edge(*sorted([w1, w2]))

        # Score nodes with the default PageRank, keep the top n_keywords.
        ranks = networkx.pagerank(graph)
        if 0 < n_keywords < 1:
            # A fraction means "this share of the candidate words".
            n_keywords = int(round(len(candidates) * n_keywords))
        # `.items()` replaces the Python-2-only `.iteritems()`.
        ranked = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
        word_ranks = dict(ranked[:n_keywords])

        # Merge adjacent keywords into keyphrases.
        keyphrases = {}
        j = 0
        for i, word in enumerate(words):
            if i < j:
                continue
            if word in word_ranks:
                kp_words = list(takewhile(lambda x: x in word_ranks,
                                          words[i:i + 10]))
                avg_pagerank = (sum(word_ranks[w] for w in kp_words)
                                / float(len(kp_words)))
                keyphrases[' '.join(kp_words)] = avg_pagerank
                # Counter as a hackish way to ensure merged keyphrases are
                # non-overlapping.
                j = i + len(kp_words)

        return sorted(keyphrases.items(), key=lambda x: x[1], reverse=True)

    sentences = text.split('.')

    for phrase, _rank in score_keyphrases_by_textrank(text):
        print(phrase)
        word_of_interest = phrase.lower()  # the word we are replacing

        # Take the first unused sentence mentioning the phrase, so that one
        # sentence never yields two questions.
        quiz_sentence = ""
        for position, sentence in enumerate(sentences):
            if word_of_interest in sentence.lower():
                quiz_sentence = sentence
                del sentences[position]
                break

        question_blanked = quiz_sentence.lower().replace(
            " " + word_of_interest, " ____").strip()
        if question_blanked:
            question_blanked = (question_blanked[0].upper()
                                + question_blanked[1:])
        # NOTE(review): a phrase with no matching sentence is recorded with
        # an empty question; confirm whether such entries should be skipped.
        arrQuestions.append((question_blanked, word_of_interest))
def mapping(list_sentence):
    """Replace user-defined function and variable names with placeholders.

    Every sentence is joined into one space-separated line, split into
    lexical pieces, converted to a token list by ``CreateVariable`` and then
    scanned: identifiers that are not constants, keywords, type words or
    recognised (non-mapped) functions are renamed to ``func_<n>`` when they
    are followed by ``(`` and to ``variable_<n>`` otherwise. Numbers are
    handed out in first-seen order and are shared by all sentences of a
    single call.

    Args:
        list_sentence: iterable of sentences; each sentence is an iterable
            of strings whose items are joined with single spaces.

    Returns:
        A list with one rewritten line per sentence.
    """
    # Two-character operators that must be kept as one lexical piece.
    two_char_operators = frozenset((
        '->', '<<', '>>', '&&', '||', '|=', '==', '!=',
        '++', '--', '+=', '-=',
    ))

    def _placeholder(table, prefix, name):
        """Return the placeholder of *name*, allocating the next one if new."""
        if name not in table:
            # Placeholders are only ever created here, numbered 0, 1, 2, ...,
            # so the next free number always equals the table size.
            table[name] = prefix + str(len(table))
        return table[name]

    def _split_line(line):
        """Cut *line* into words, operators and quoted literals."""
        pieces = []
        start = 0        # index where the pending word begins
        pos = 0
        quote = None     # the opening quote while inside a literal
        literal = ''
        size = len(line)
        while pos < size:
            char = line[pos]
            if quote is not None:
                # Inside a string/character constant: copy characters up to
                # and including the matching closing quote.
                # NOTE(review): an unterminated literal is dropped, exactly
                # as in the previous implementation.
                literal += char
                if char == quote:
                    pieces.append(literal)
                    literal = ''
                    quote = None
                    start = pos + 1
                pos += 1
            elif isphor(char, space):
                # The character is whitespace: it ends the pending word
                # (possibly an empty one, which is filtered out below).
                if pos > 0:
                    pieces.append(line[start:pos])
                start = pos + 1
                pos += 1
            elif pos + 1 == size:
                # Last character: flush whatever is still pending.
                pieces.append(line[start:])
                break
            elif isphor(char, phla):
                # The character is not a letter, digit or underscore. A next
                # character always exists here because the last-character
                # case was handled above.
                pair = line[pos:pos + 2]
                if pair in two_char_operators:
                    pieces.append(pair)
                    start = pos + 2
                    pos += 2
                elif char == '"' or char == '\'':
                    # Start of a string constant.
                    literal = char
                    quote = char
                    pos += 1
                else:
                    pieces.append(char)
                    start = pos + 1
                    pos += 1
            else:
                pos += 1
        # Drop empty pieces. Assumes the module-level `spa` is the empty
        # string -- TODO confirm.
        return [piece for piece in pieces if piece != spa]

    list_code = [' '.join(code) for code in list_sentence]
    _func_dict = {}
    _variable_dict = {}

    for index, line in enumerate(list_code):
        token = []
        CreateVariable(_split_line(line), token)

        count = len(token)  # only elements are replaced, never the length
        j = 0
        while j < count:
            name = token[j]
            if name in constValue:
                j += 1
            elif isphor(name, variable):
                # The token has the shape of an identifier.
                has_next = j + 1 < count
                if (name in keywords_0 or name in typewords_0
                        or name in typewords_1 or name in typewords_2):
                    # It is a keyword.
                    j += 1
                elif (j - 1 >= 0 and has_next and token[j - 1] == 'new'
                        and token[j + 1] == '['):
                    # `new T[...]`: T is kept as is.
                    j += 2
                elif has_next and token[j + 1] == '(':
                    # Identifier followed by '(': decide whether it is a
                    # function that has to be renamed. Not renamed are, in
                    # this order: if-like structures (keywords_1; these
                    # should not occur in PDG code), sensitive functions
                    # (keywords_2), names matching the sensitive-function
                    # wildcard patterns (isinKeyword_3), C/C++ library
                    # functions and APIs (keywords_4), and names matching
                    # the malloc-style wildcard patterns (isinKeyword_5).
                    if (name in keywords_1 or name in keywords_2
                            or isinKeyword_3(name) or name in keywords_4
                            or isinKeyword_5(name)):
                        j += 2
                    else:
                        # User-defined function.
                        token[j] = _placeholder(_func_dict, 'func_', name)
                        j += 2
                elif has_next and not isphor(token[j + 1], variable):
                    # Identifier whose next token is not an identifier.
                    if token[j + 1] == '*':
                        if j + 2 < count and token[j + 2] == 'const':
                            j += 3
                        elif j - 1 >= 0 and token[j - 1] == 'const':
                            j += 2
                        elif j - 1 > 0 and token[j - 1] in operators:
                            # Variable inside an arithmetic expression.
                            # NOTE(review): `> 0` (not `>= 0`) is kept from
                            # the previous implementation; it skips the
                            # operator test when the identifier is token 1.
                            token[j] = _placeholder(
                                _variable_dict, 'variable_', name)
                            j += 2
                        elif j + 2 < count and token[j + 2] == ')':
                            j += 2
                        else:
                            token[j] = _placeholder(
                                _variable_dict, 'variable_', name)
                            j += 2
                    else:
                        token[j] = _placeholder(
                            _variable_dict, 'variable_', name)
                        j += 2
                elif j + 1 == count:
                    # Identifier is the last token of the line.
                    token[j] = _placeholder(_variable_dict, 'variable_', name)
                    break
                else:
                    j += 1
            else:
                # Numeric constants, string constants and everything else
                # are left unchanged.
                j += 1

        list_code[index] = ' '.join(token)

    return list_code
def create_tokens(sentence):
    """Split a line of source code into word and operator tokens.

    Runs of letters, digits and underscores become one token each. Any other
    non-whitespace character becomes its own token, except that two or three
    consecutive such characters are merged when ``doubisphor`` /
    ``trisphor`` accept them as one operator. Whitespace separates tokens
    and is not returned.

    Args:
        sentence: the text to tokenise.

    Returns:
        The list of tokens in order of appearance; ``[]`` for an empty
        sentence.
    """
    phla = r'[^_a-zA-Z0-9]'   # anything that cannot be part of a word
    space = r'\s'

    tokens = []
    start = 0                 # index where the pending word begins
    pos = 0
    size = len(sentence)

    while pos < size:
        char = sentence[pos]
        if isphor(char, space):
            if pos > start:
                tokens.append(sentence[start:pos])
            start = pos + 1
        elif isphor(char, phla):
            # Flush the pending word first; it may be empty, and empty
            # tokens are filtered out at the end.
            tokens.append(sentence[start:pos])
            if pos + 1 < size and isphor(sentence[pos + 1], phla):
                following = sentence[pos + 1]
                if doubisphor(char, following):
                    pair = char + following
                    if pos + 2 < size and isphor(sentence[pos + 2], phla):
                        third = sentence[pos + 2]
                        if trisphor(pair, third):
                            tokens.append(pair + third)
                        else:
                            tokens.append(pair)
                            tokens.append(third)
                        start = pos + 3
                        pos += 2
                    else:
                        tokens.append(pair)
                        start = pos + 2
                        pos += 1
                else:
                    # `following` may itself be whitespace (it matches
                    # `phla` too); such tokens are cleaned up below.
                    tokens.append(char)
                    tokens.append(following)
                    start = pos + 2
                    pos += 1
            else:
                tokens.append(char)
                start = pos + 1
        pos += 1

    # A sentence ending in a word character leaves that word pending;
    # without this flush the last token would be lost.
    if start < size:
        tokens.append(sentence[start:])

    # Clean-up of whitespace that slipped in as a look-ahead character.
    # Only the first '\r' is removed, as before.
    if '\r' in tokens:
        tokens.remove('\r')
    return [token for token in tokens if token != ' ' and token != '']