# Imports assumed by the functions below; the project helpers (config, parser,
# er, compareTEXT, generateLOG, news_insertMongoDB) are defined elsewhere in
# this module/package.
import time

import requests
from newspaper import Article
from search_engines import Duckduckgo
from tldextract import extract


def search_DDG_DORKS(TITLE, TEXT_0, target):
    # `target` is now an explicit parameter: it was used below but never
    # defined in the original function scope
    engine = Duckduckgo()
    for FC_domain in config.FC_list:
        results = engine.search(f"site:{FC_domain} {TITLE}")
        for r in results:
            print("|--[INFO][DUCKDUCKGO][RESULTS][>] " + r["title"] + " | " + r["text"] + " | " + r["link"])
            try:
                # Split the URL into subdomain, domain and suffix
                tsd, td, tsu = extract(r["link"])
                domain = td + '.' + tsu
                web = requests.get(r["link"], timeout=3)
                print("|----[INFO][WEB][HTTP CODE][>] " + str(web.status_code) + "\n")
                # Only process successful (2xx) responses; the original used
                # `or`, which made the check always true
                if 200 <= web.status_code < 300:
                    # Skip PDFs and blacklisted domains
                    if ".pdf" not in r["link"] and domain not in config.BL_parserPhone:
                        TEXT = er.remove_tags(str(web.text))
                        parser.FC_words_in_text(TEXT)
                        parser.parserMAIN(TEXT)
                        ratio = compareTEXT(TEXT_0, TEXT)
                        print(f"|----[INFO][COMPARE TEXTS][>] Ratio: {ratio}")
                        # Save the info to a log file
                        data = f"{r['title']} ||| {r['link']} ||| {r['text']} ||| {ratio} \n"
                        generateLOG(data, target)
                print("")
                time.sleep(2)
            except Exception as e:
                print("|----[ERROR][HTTP CONNECTION][>] " + str(e))
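# `compareTEXT` is a project helper defined elsewhere. Judging only by how it
# is used above (it returns a similarity ratio between two texts), a minimal
# stand-in could be the standard-library sketch below; this is an assumption,
# not the project's actual implementation, so it is left commented out:
#
#     from difflib import SequenceMatcher
#
#     def compareTEXT(text_a, text_b):
#         # Return a 0.0-1.0 similarity ratio between the two strings
#         return SequenceMatcher(None, text_a, text_b).ratio()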
def news_parser(url, target):
    # Download the article
    article = Article(url, language='es')
    article.download()
    # Parse the article
    article.parse()
    # Print the extracted metadata and run the project parsers over the text
    print(f"|----[INFO][WEB][>] {article.title}")
    print(f"|--------[INFO][WEB][AUTHORS][>] {article.authors}")
    print(f"|--------[INFO][WEB][PUBLISH DATE][>] {article.publish_date}")
    parser.parser_email(article.text)
    parser.parser_DNI(article.text)
    parser.parser_IBAN(article.text)
    parser.parser_n_tlfn(article.text)
    parser.FC_words_in_text(article.text)
    print(f"|--------[INFO][WEB][URL][>] {url}")
    # Persist the article in MongoDB
    news_insertMongoDB(target, url, article.title, article.authors, article.text,
                       article.publish_date, article.top_image, article.movies,
                       article.html)
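# Usage sketch: the target name and URL below are hypothetical placeholders;
# the real invocation lives in the project's main entry point.
if __name__ == "__main__":
    search_DDG_DORKS("John Doe", "known reference text about the target", "john_doe")
    news_parser("https://example.com/news/some-article", "john_doe")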