def get_software_name_and_version_from_cpe(cpe):
    """Split a CPE string into a ``(software, version)`` pair of strings.

    The string is split on ``':'`` and the first two fields are dropped.
    The remaining fields are scanned in order; scanning stops at the first
    field that contains neither a letter nor a number (as judged by
    ``utils.contain_letter`` / ``utils.contain_number``).  Fields seen
    before the first one starting with a digit form the software name;
    that field and all later ones form the version.  ``'_'`` and ``'~'``
    are replaced by spaces in every field.

    If no version was collected, ``extract_windows_version`` is applied to
    the software name; if the software name is then empty,
    ``corner_case`` is applied to the version.
    """
    software_words = []
    version_words = []
    in_version = False

    for field in cpe.split(':')[2:]:
        # A field with no letters and no digits ends the useful part.
        if not (utils.contain_letter(field) or utils.contain_number(field)):
            break

        text = field.replace('_', ' ').replace('~', ' ')
        # Once a field starts with a digit, everything after it is version.
        if text[0].isdigit():
            in_version = True

        if in_version:
            version_words.append(text)
        else:
            software_words.append(text)

    software = ' '.join(software_words).strip()
    version = ' '.join(version_words).strip()

    if not version:
        software, version = extract_windows_version(software)
    if not software:
        software, version = corner_case(version)
    return software, version
def judge_word_in_sentence_with_both_dots_and_numbers(sent):
    """Return True if some token of ``sent`` has both a number and a dot.

    ``sent`` is tokenized with ``nltk.word_tokenize``; a token qualifies
    when ``utils.contain_number`` accepts it and it contains ``'.'``.
    Returns False when no token qualifies.
    """
    return any(
        utils.contain_number(token) and '.' in token
        for token in nltk.word_tokenize(sent)
    )
def corner_case(version):
    """Split ``version`` at its last number-bearing word.

    The string is split on whitespace and scanned from the end for the
    last word accepted by ``utils.contain_number``.  The words before it
    are returned as the first element and that word plus everything after
    it as the second, each joined by single spaces.  For example
    ``'1024cms 1024 cms 1.4.2 beta'`` is split in front of ``'1.4.2'``.

    If no word contains a number, ``('', version)`` is returned with
    ``version`` unchanged.
    """
    tokens = version.split()
    for pos in range(len(tokens) - 1, -1, -1):
        if utils.contain_number(tokens[pos]):
            return ' '.join(tokens[:pos]), ' '.join(tokens[pos:])
    return '', version
def clean_redundant_words_and_reserve_range(before_clean_list):
    """Filter each version string down to its numeric and range words.

    A whitespace-separated word is kept when it
      * contains a number (``utils.contain_number``), or
      * contains no letter (``utils.contain_letter`` is false), or
      * is one of the range words listed below.

    The surviving words of each input string are re-joined with single
    spaces.  Duplicates and empty results are dropped, so the returned
    list comes from a set and has no guaranteed order.
    """
    # todo: enrich range word set
    range_words = {'before', 'older', 'prior', 'up', 'to', 'through', 'and',
                   'earlier', 'upper', 'higher', 'lower', 'including',
                   'since', 'onwards'}

    def _keep(token):
        # "has a number, or has neither number nor letter" reduces to
        # "has a number, or has no letter".
        return (utils.contain_number(token)
                or not utils.contain_letter(token)
                or token in range_words)

    cleaned = {
        ' '.join(token for token in entry.split() if _keep(token))
        for entry in before_clean_list
    }
    cleaned.discard('')
    return list(cleaned)
def extract_pair_from_edb_title(raw_title):
    """Extract a one-entry ``{software: version}`` dict from a title.

    Only titles that contain ``' - '`` and whose text before the first
    ``' - '`` contains a number (``utils.contain_number``) are processed;
    every other title yields an empty dict.

    For a processed title, ``get_pair_from_content_line_focus_official``
    supplies a (version, software) pair from the whole title.  The version
    is cut at its first ``' - '``, the pair is passed through
    ``move_range_from_software_to_version``, and both values are passed
    through ``encode_content`` before being stored.

    Returns:
        dict: ``{encoded_software: encoded_version}`` or ``{}``.
    """
    if ' - ' not in raw_title:
        return {}

    # software_in_title might contain version
    software_in_title = raw_title.split(' - ')[0]
    if not utils.contain_number(software_in_title):
        return {}

    content_line_version, content_line_software = (
        get_pair_from_content_line_focus_official(raw_title))

    # Keep only the text before the first ' - '.  partition() returns the
    # whole string when the separator is absent; the previous
    # ``s[:s.find(' - ')]`` sliced with -1 in that case and silently
    # dropped the last character of the version.
    content_line_version = content_line_version.partition(' - ')[0]

    content_line_version, content_line_software = (
        move_range_from_software_to_version(content_line_version,
                                            content_line_software))
    return {
        encode_content(content_line_software):
            encode_content(content_line_version)
    }
def get_pair_from_content_line_focus_official(line):
    """Split a content line into a ``(version, software)`` pair.

    The line is lower-cased and split on whitespace, then a split index
    ``version_loc`` is chosen by the first strategy that applies:

    1. If a word equals ``'windows'`` or ``'office'``, the index right
       after the first such word is used.
    2. Otherwise, the index of the first word matching the version
       pattern (e.g. ``1.2``, ``v1.2.3``, ``1.x``) is used, unless that
       index is 0.
    3. Otherwise, the index of the first word that contains a number and
       is not an architecture tag (``x64``, ``x86``, ``x86_64``) is used,
       defaulting to 0 when there is none.

    The word list and index are handed to ``get_right_part`` (version)
    and ``get_left_part`` (software); presumably these return the words
    from / before ``version_loc`` -- their definitions are not visible
    here.

    Returns:
        tuple: ``(version, software)``, both lower-cased; the software
        part is first passed through
        ``remove_duplicate_word_from_software``.
    """
    version_pattern = r'(v)?[\d]{1,2}((\.[\d]{1,2}){1,2}(\.x)?|\.x)'
    arch_tags = ('x64', 'x86', 'x86_64')

    # str.split() with no separator never yields empty strings, so the
    # words need no further filtering.
    con_word_list = line.lower().split()
    used_fallback = False

    # Strategy 1: the version follows a well-known product keyword.
    keyword_loc = next(
        (idx for idx, word in enumerate(con_word_list)
         if word in ('windows', 'office')),
        None)
    if keyword_loc is not None:
        version_loc = keyword_loc + 1
    else:
        # Strategy 2: first word shaped like a dotted version number.
        version_loc = next(
            (idx for idx, word in enumerate(con_word_list)
             if re.match(version_pattern, word)),
            0)
        if version_loc == 0:
            # Strategy 3: first word containing any number, skipping
            # architecture tags.  The words are already lower-case, so
            # they are compared directly; the earlier ``word.lower not
            # in [...]`` compared a bound method with strings and was
            # therefore always true.
            used_fallback = True
            version_loc = next(
                (idx for idx, word in enumerate(con_word_list)
                 if utils.contain_number(word) and word not in arch_tags),
                0)

    content_line_version = get_right_part(con_word_list, version_loc)
    content_line_software = get_left_part(con_word_list, version_loc)

    # NOTE(review): this diagnostic is kept on the fallback path only,
    # which is the layout the original appears to have had -- confirm.
    if used_fallback and content_line_version in content_line_software:
        print('ERROR if content_line_version in content_line_software:')

    return (content_line_version.lower(),
            remove_duplicate_word_from_software(content_line_software).lower())
def remove_space_in_focus_version(version_str):
    """Join a two-word version whose second word starts with a dot.

    When ``version_str`` splits into exactly two whitespace-separated
    words, the second word begins with ``'.'`` and the rest of that word
    contains a number (``utils.contain_number``), every space character
    is removed from ``version_str``.  Otherwise it is returned unchanged.
    """
    pieces = version_str.split()
    if len(pieces) != 2:
        return version_str

    tail = pieces[1]
    if tail.startswith('.') and utils.contain_number(tail[1:]):
        return version_str.replace(' ', '')
    return version_str