import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist


def pkmpopana():
    """Count mentions of first-generation Pokemon names across the
    scraped text files and plot the most frequent ones."""
    # Load and concatenate the data files pkm-19-clean .. pkm-26-clean.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.x.
    df = pd.concat(
        [pd.read_csv(f'pkm-{i}-clean.csv') for i in range(19, 27)],
        ignore_index=True,
    )
    # Build one lowercase string from all text rows.
    sen = ''.join(df['text']).lower()
    # Tokenize on word characters and drop English stopwords.
    toker = RegexpTokenizer(r'\w+')
    words = toker.tokenize(sen)
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in words if w not in stop_words]
    fdist = FreqDist(filtered_sentence)
    # Keep only generation-one Pokemon (ids 1-151).
    pk = pd.read_csv('pokemon.csv')
    pk = pk[pk['id'] < 152]
    pkmname = list(pk['pokemon'])
    # Map each Pokemon name to its mention count (named ``counts`` to
    # avoid shadowing the ``re`` module).
    counts = {}
    for n in pkmname:
        if n in fdist:
            counts[n] = fdist[n]
    so = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Plot the top entries as a horizontal bar chart, highest at the top.
    l, p = [], []
    tar = so[0:2]
    for name, count in tar:
        l.append(count)
        p.append(name)
    plt.barh(list(range(len(tar))), width=l[::-1], align='center')
    plt.xlabel('count')
    plt.ylabel('name')
    plt.yticks(list(range(len(tar))), p[::-1])
    plt.show()
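# A minimal sketch of the one-time setup pkmpopana() assumes: the NLTK
# stopword corpus must be downloaded before stopwords.words('english')
# will resolve. The CSV filenames are taken from the function above.
import nltk

nltk.download('stopwords')  # no-op if the corpus is already present

pkmpopana()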
def _expand_requires_extra(re):
    """Yield requirement strings with ``extra == "..."`` environment
    markers attached, one per requirement, sorted by extra name."""
    for extra, reqs in sorted(re.items()):
        for req in reqs:
            if ';' in req:
                # The requirement already carries a marker: AND it with the extra.
                name, envmark = req.split(';', 1)
                yield '{} ; extra == "{}" and ({})'.format(
                    name, extra, envmark)
            else:
                yield '{} ; extra == "{}"'.format(req, extra)
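# For illustration, a hedged example of feeding this generator a
# requires-extra style mapping; the sample dict below is made up, not
# taken from the original code.
requires_extra = {
    'test': ['pytest', 'mock;python_version < "3.6"'],
}
for line in _expand_requires_extra(requires_extra):
    print(line)
# pytest ; extra == "test"
# mock ; extra == "test" and (python_version < "3.6")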
def get_url(self, path):
    """Collect the URL of every request in the case file, prefixing
    the configured base_url when one is set."""
    conf = self.get_config()
    data = self.get_case(path)
    url_li = []
    if "base_url" in conf:
        for i in data:
            req = i["request"]  # ``req`` avoids shadowing the ``re`` module
            for k, v in req.items():
                if k == "URL":
                    url_li.append(conf["base_url"] + v)
    else:
        for i in data:
            req = i["request"]
            for k, v in req.items():
                if k == "URL":
                    url_li.append(v)
    return url_li
def get_params(self, path):
    """Collect the params entry of every request in the case file."""
    data = self.get_case(path)
    params_li = []
    for i in data:
        req = i["request"]
        for k, v in req.items():
            if k == "params":
                params_li.append(v)
    return params_li
def get_headers(self, path):
    """Collect the headers entry of every request in the case file."""
    data = self.get_case(path)
    headers_li = []
    for i in data:
        req = i["request"]
        for k, v in req.items():
            if k == "headers":
                headers_li.append(v)
    return headers_li
def get_method(self, path):
    """Collect the HTTP method of every request in the case file."""
    data = self.get_case(path)
    method_li = []
    for i in data:
        req = i["request"]
        for k, v in req.items():
            if k == "method":
                method_li.append(v)
    return method_li
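# These getters all walk the same structure, so a minimal, self-contained
# sketch of the case data they appear to expect may help. The config
# values, URL, and field contents below are assumptions for illustration;
# only the "request"/"URL"/"method"/"headers"/"params" keys come from the
# functions above.
conf = {"base_url": "http://example.com"}  # hypothetical config
cases = [{
    "request": {
        "URL": "/login",
        "method": "POST",
        "headers": {"Content-Type": "application/json"},
        "params": {"user": "demo"},
    },
}]

urls = [conf["base_url"] + c["request"]["URL"] for c in cases]
methods = [c["request"]["method"] for c in cases]
print(urls)     # ['http://example.com/login']
print(methods)  # ['POST']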
# 3. Similarity analysis
# Compute the TF-IDF weights of the test document once and reuse them.
doc_test_tfidf = tfidf[doc_test_vec]
index = similarities.SparseMatrixSimilarity(
    tfidf[corpus], num_features=len(dictionary.keys())
)
# Score the test document against every target document.
sims = index[doc_test_tfidf]
# Tidy the output: add each sentence's prior weight to its nonzero
# similarity, collecting one score per sentence in a list.
simss = []
for i in range(len(sims)):
    if sims[i] != 0:
        sims[i] += weighList[i]
    simss.append(sims[i])
print("Final result (text paired with similarity):")
# Combine each question with its similarity score into a dict, then sort
# descending by score (``result`` avoids shadowing the ``re`` module).
result = dict(zip(kownledge, simss))
d_order = sorted(result.items(), key=lambda x: x[1], reverse=True)
for i in range(10):
    print(d_order[i], end="")
    print(kownDict[(d_order[i][0]).encode("utf-8")])
# Keep the three best-matching knowledge points.
baseKownledge = []
for i in range(3):
    baseKownledge.append(d_order[i][0])
print(baseKownledge)
'''
print("Sorted result")
re2 = sorted(enumerate(sims), key=lambda item: -item[1])  # sort by similarity
for i in range(3):
    print(re2[i])
'''
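# The snippet above assumes a gensim dictionary, a TF-IDF model, and a
# vectorized test document already exist. A minimal sketch of that setup
# follows; the tokenized texts and the query are made-up placeholders,
# not data from the original project.
from gensim import corpora, models, similarities

texts = [["install", "python", "package"],
         ["configure", "virtual", "env"]]   # hypothetical tokenized corpus
doc_test = ["install", "package"]           # hypothetical tokenized query

# Map tokens to ids, then vectorize each document as (id, count) pairs.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
doc_test_vec = dictionary.doc2bow(doc_test)

# Fit the TF-IDF model on the corpus; tfidf[...] reweights any BoW vector.
tfidf = models.TfidfModel(corpus)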