def down(url: str):
    """Download one chapter page and save its text content to a .txt file.

    The output filename is derived from the chapter title found at
    '.bookname h1'; the file is written into the module-level ``saveDir``.
    If a file with that name already exists, nothing is written.

    NOTE(review): relies on module globals ``saveDir`` (output directory,
    presumably ends with a path separator — confirm) and ``HTMLSession``
    from requests_html.
    """
    # Fetch and parse the page
    r = HTMLSession().get(url).html
    # Extract the chapter title and normalize it
    top = r.find('.bookname h1')[0].text
    top = top.split()
    if len(top) > 1:
        # Drop a trailing parenthesised suffix from the second token, if any
        index = top[1].find('(')
        if index != -1:
            top = top[0] + ' ' + top[1][:index]
        else:
            top = ' '.join(top)
    else:
        top = top[0]
    # Extract the chapter body text
    text = r.find('#content')[0].text
    # Write the chapter to <saveDir><title>.txt
    txt_path = saveDir + top + '.txt'
    try:
        if os.path.exists(txt_path):
            print(top + ' 文件已存在!')
        else:
            # Explicit UTF-8: the content is Chinese text and the platform
            # default encoding (e.g. GBK/cp1252 on Windows) may not hold it.
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(text)
            print(top + ' 写入成功!')
    except OSError:
        # Narrowed from a bare `except:` — only file-system errors are
        # expected here; anything else should propagate.
        print('文件写入错误!')
def retrieveCWEURLFromCVE(cve_full_url=None, cve_id=None):
    """Resolve a CVE to the URL of its associated CWE entry.

    Please specify either cve_full_url OR cve_id
    Priority non-null arg: cve_full_url
    Examples:
    cve_full_url=https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-7589
    OR cve_id=CVE-2015-7589

    Returns the CWE URL string, '' when the NVD page has no CWE link, or
    None when both arguments are None.
    Raises ValueError when cve_full_url is not a cve.mitre.org URL.
    """
    if (cve_full_url is not None) and (
            not cve_full_url.startswith('https://cve.mitre.org/')):
        raise ValueError('Invalid CVE url')
    if (cve_full_url is None) and (cve_id is None):
        return None
    if cve_full_url is not None:
        url = cve_full_url
        # Pull the CVE id out of the URL for logging; fall back to the
        # whole URL when no id-shaped substring is present.
        cve_regex_test = re.search(r"CVE-[0-9]+-[0-9]+", url)
        if cve_regex_test:
            cve_id = cve_regex_test.group()
        else:
            cve_id = url
    else:
        url = CVE_BASE_PATH + cve_id
    print("[retrieveCWEURLFromCVE] Parsing CVE: " + cve_id +
          ", from url: " + url)
    # The MITRE page links to the authoritative NVD entry.
    cve_response = HTMLSession().get(url).html
    cve_url = cve_response.find(
        "div#GeneratedTable .ltgreybackground .larger a",
        first=True).attrs['href']
    # NOTE(review): this request uses a module-level `session` while the
    # one above creates a fresh HTMLSession — presumably intentional reuse
    # of a shared session; confirm against module scope.
    cve_response = session.get(cve_url).html
    # Query the CWE link once instead of twice (original repeated the find).
    cwe_link = cve_response.find("div#vulnTechnicalDetailsDiv td a",
                                 first=True)
    cwe_url = cwe_link.attrs['href'] if cwe_link is not None else ''
    if PRINT_DEBUG is True:
        print("[retrieveCWEURLFromCVE] CWE Url of " + cve_id + ": " + cwe_url)
    return cwe_url
def get_full_article(self):
    """Fetch the page at ``self.url`` and return the text of every <p>
    inside its first <article>, one paragraph per line.

    Returns None (after logging) when the fetch or extraction fails.
    """
    try:
        response = HTMLSession().get(url=self.url)
        article = response.html.find('article', first=True)
        # Join at C speed instead of repeated string concatenation;
        # each paragraph keeps its trailing newline, as before.
        return ''.join(paragraph.text + '\n'
                       for paragraph in article.find('p'))
    except Exception as e:
        article_logger.error(f'Probléma a cikk kinyerésekor:\n{e}')
        return None
def _parse_data_lists(html: HTMLSession): finn_codes = [] data_lists = html.find('article') for el in data_lists: finn_codes.append(el.find('a')[0].attrs["href"].split("=")[-1]) return finn_codes