def search_relatives(query_str, ignore_url=""):
    """Search Bing News for posts similar to *query_str*.

    "Similar" means a token_sort_ratio score above 50; at most 50 Bing
    results are examined.

    :param query_str: content to match (a post title or post body)
    :param ignore_url: URL (or URL fragment) whose results are skipped,
        typically the post being checked itself; empty string disables it
    :return: dict with 'relatives' (list of dicts of extracted post data)
        and 'score' (mean token_sort_ratio over the collected relatives,
        or 0 when none matched)
    :raises BeaverError: when the MS_BING_KEY environment variable is
        missing or the Bing request itself fails
    """
    rel_response = dict(relatives=[])
    meta_score = 0
    if "MS_BING_KEY" not in os.environ:
        raise BeaverError(
            "Chaves da Microsoft devem estar presentes na variável do sistema MS_BING_KEY"
        )
    try:
        results = PyMsCognitiveNewsSearch(
            os.environ.get("MS_BING_KEY"),
            normalize(query_str),
            custom_params={
                "mkt": settings['language'],
                "setLang": settings['language'][:2]
            }).search(limit=50, format='json')
        log.info("Encontrado " + str(len(results)) + " resultados.")
    except Exception as e:
        raise BeaverError(
            "Não foi possível se comunicar com o Bing, talvez as chaves tenham expirado? ["
            + str(e) + "]")
    for result in results:
        # Compute the similarity once per result instead of three times.
        score = fuzz.token_sort_ratio(query_str, result.name)
        log.info("Analisando: " + str(result.name) + ". Token sort: " + str(score))
        # BUG FIX: the original tested `if True:` here, accepting every
        # result; the documented contract is a score above 50.
        if score > 50:
            log.info("Achado compatível." + str(score) + " " + result.name)
            try:
                # Goose extraction errors (mostly 404/500) are logged and
                # skipped; the loop moves on to the next result.
                # BUG FIX: test that ignore_url is non-empty BEFORE the
                # substring test — with the default "" the original raised
                # for every result ("" is a substring of any URL), so no
                # relatives were ever collected. The original string
                # literal here also contained a raw embedded newline.
                if ignore_url and ignore_url in result.url:
                    raise BeaverError(
                        "URL inválida (não pode ser de mesmo domínio). "
                        + ignore_url + " = " + result.url)
                dados = extract(result.url)
                dados['date'] = pendulum.parse(result.date_published, tz=settings['timezone'])
                rel_response['relatives'].append(dados)
                meta_score += score
            except Exception as e:
                log.error("Erro: " + str(e))
    log.info("Relativos: " + str(rel_response['relatives']))
    if meta_score > 0:
        rel_response['score'] = meta_score / len(rel_response['relatives'])
    else:
        rel_response['score'] = meta_score
    log.info("Retornando " + str(rel_response))
    return rel_response
import config
import json
from py_ms_cognitive import PyMsCognitiveNewsSearch
from time import sleep

topic = input("Enter in the topic: ")
print("\nGathering news articles about " + topic)

# Search for articles using Bing's News Search API.
search_service = PyMsCognitiveNewsSearch(config.bing_search_api_key, topic)
articles = search_service.search(limit=50, format='json')  # first 50 articles

try:
    # Append each article's raw JSON to <Topic>.json, one pretty-printed
    # object per article.
    with open(topic.title().replace(' ', '') + '.json', 'a') as f:
        for article in articles:
            f.write(json.dumps(article.json, indent=4, sort_keys=True) + '\n')
            print("Saving article with title: " + article.name)
            sleep(0.2)  # small delay between writes
# BUG FIX: the original caught BaseException, which also swallows
# KeyboardInterrupt and SystemExit; Exception is the broadest class a
# script boundary should catch.
except Exception as e:
    print("Error on_data: %s\n" % str(e))
def test_search_all(self):
    """search_all honours the requested quota and returns on-topic hits."""
    web_bing = PyMsCognitiveNewsSearch(SECRET_KEY, "Python")
    result_one = web_bing.search_all(quota=60)
    # assertEqual/assertIn report the actual values on failure, unlike
    # assertTrue over a pre-evaluated boolean expression.
    self.assertEqual(len(result_one), 60)
    self.assertIn("python", result_one[0].name.lower())
def test_can_search(self):
    """A plain search returns at least one on-topic result."""
    web_bing = PyMsCognitiveNewsSearch(SECRET_KEY, "Python")
    result_one = web_bing.search(limit=50)
    # assertGreater/assertIn report the actual values on failure, unlike
    # assertTrue over a pre-evaluated boolean expression.
    self.assertGreater(len(result_one), 0)
    self.assertIn("python", result_one[0].name.lower())
def get_search(search_term, k):
    """Return the first *k* Bing News results for *search_term* as JSON."""
    service = PyMsCognitiveNewsSearch(API_KEY, search_term)
    return service.search(limit=k, format='json')
from py_ms_cognitive import PyMsCognitiveNewsSearch
import pandas as pd
import numpy as np
import csv

# NOTE(review): the Bing API key below is hard-coded in source; it should be
# moved to an environment variable or config file and rotated, since any copy
# of this file now exposes it.
with open('/Users/kunalsingh/Documents/ML@Berkeley/Investarget/bing_data.csv', 'a') as data:
    writer = csv.writer(data)
    # Column 1 of the merge file holds the company names to query.
    df = pd.read_csv(
        '/Users/kunalsingh/Documents/ML@Berkeley/Investarget/CrunchbaseMattermarkMerge.csv',
        usecols=[1])
    names = df['Name'].values.tolist()
    for name in names:
        search_service = PyMsCognitiveNewsSearch(
            '7831cba4b4104e7b9d45ab6666ad3514', name)
        # BUG FIX (naming): the original called this first_fifty_result,
        # but only 10 results are requested.
        results = search_service.search(limit=10, format='json')
        # One CSV row per company: the descriptions of its top articles.
        writer.writerow([item.description for item in results])