def yandexSearch(api_user, api_key, top_k_results, prefixs, suffixs, outputFileName):
    """Run a Yandex search for every prefix+suffix pair and save top URLs.

    Requires a Yandex XML-search account. Registration is free; see
    https://pypi.org/project/yandex-search/ for more details.

    :param api_user: [str] your credentials username
    :param api_key: [str] your credentials api key
    :param top_k_results: [int] only keep top k results
    :param prefixs: [list] prefix for searches
    :param suffixs: [list] suffix for searches
    :param outputFileName: [str] output name (".txt" is appended)
    :return: void
    """
    yandex = yandex_search.Yandex(api_user=api_user, api_key=api_key)
    # Context manager guarantees the output file is flushed and closed even
    # on error (the original leaked the handle).
    with open(outputFileName + ".txt", 'w') as output:
        for prefix in prefixs:
            for suffix in suffixs:
                query = prefix + suffix
                output.write("=====" + query + "=====" + "\n")
                try:
                    results = yandex.search(query).items
                    # Slice instead of indexing so queries with fewer than
                    # top_k_results hits no longer raise IndexError.
                    for item in results[:top_k_results]:
                        output.write(str(item['url']) + "\n")
                except Exception as exc:
                    # Narrowed from a bare except: keep the best-effort
                    # behavior but say what actually failed.
                    print("quest failed: {}".format(exc))
def main(args):
    """Fetch Yandex search results for each query and save the pages to disk.

    For every non-empty line in ``args.queries_file``, pages
    ``0 .. args.get_pages-1`` of Yandex results are retrieved and each hit's
    HTML is written to ``args.out_dir`` as ``<query idx>_<result idx>.html``.

    :param args: namespace with out_dir, user, key, queries_file, get_pages
    """
    os.makedirs(args.out_dir, exist_ok=True)
    logger = setup_logger()
    yandex = yandex_search.Yandex(api_user=args.user, api_key=args.key)
    with open(args.queries_file, 'r') as queries_f:
        for query_i, query in enumerate(queries_f):
            query = query.strip()
            if not query:
                continue
            logger.info(f'Query {query_i}: {query}')
            query_res_i = 0
            for page_i in range(args.get_pages):
                for found_item in yandex.search(query, page=page_i).items:
                    url = found_item['url']
                    logger.info(f'Found item {query_res_i}: {url}')
                    resp = requests.get(url)
                    out_path = os.path.join(
                        args.out_dir,
                        f'{query_i:03d}_{query_res_i:05d}.html')
                    # resp.content is bytes, so the file must be opened in
                    # binary mode (the original used 'w' and raised
                    # TypeError on the first write).
                    with open(out_path, 'wb') as item_f:
                        item_f.write(resp.content)
                    query_res_i += 1
def get_data(phishtank_key, force_update=False):
    """Return (phishing, benign) URL DataFrames, downloading/caching as CSV.

    Phishing URLs come from PhishTank; "benign" URLs are the top-10 Yandex
    hits for Google-autocomplete suggestions of every keyword in the
    "keywordList" file.  Both sets are cached on disk and only rebuilt when
    the cache file is missing or force_update is True.

    :param phishtank_key: PhishTank API key interpolated into the data URL
    :param force_update: rebuild both CSV caches even if they exist
    :return: tuple (phishtank DataFrame, common DataFrame)
    """
    if not os.path.isfile("phishtank.csv") or force_update:
        urllib.request.urlretrieve(
            "http://data.phishtank.com/data/{}/online-valid.csv".format(
                phishtank_key),
            "phishtank.csv", show_progress)
    if not os.path.isfile("common.csv") or force_update:
        data = {"url": []}
        # The with-block closes the file; the original's extra close() call
        # inside it was redundant.
        with open("keywordList") as wordlist:
            keywords = wordlist.read().split("\n")
        headers = {'User-agent': 'Mozilla/5.0'}
        suggestions = []
        for word in keywords:
            URL = (
                "http://suggestqueries.google.com/complete/search?client=firefox&q="
                + word)
            response = requests.get(URL, headers=headers)
            result = json.loads(response.content.decode('utf-8'))
            for r in result[1]:
                suggestions.append(r)
        # SECURITY NOTE(review): credentials are hard-coded here; they
        # should come from configuration or the environment.
        yandex = yandex_search.Yandex(
            api_user='******',
            api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
        for word in suggestions:
            top10 = yandex.search(word).items[0:10]
            for site in top10:
                # Each result item is a dict; store only its URL so the
                # "url" column holds strings (the original appended the
                # whole item dict).
                data["url"].append(site['url'])
        common = pd.DataFrame(data)
        common.to_csv("common.csv")
    urls = (pd.read_csv("phishtank.csv"), pd.read_csv("common.csv"))
    return urls
def check_url(self, url):
    """Classify *url*; blacklist it and bump spam points when flagged.

    Builds feature data for the URL (using a Yandex search client) and,
    if the model predicts it is malicious, blacklists the domain and
    increments the spam score by the configured sensitivity.
    """
    search_engine = yandex_search.Yandex(
        api_user='******',
        api_key='03.1042294429:b8e679f9acadef49ebab0d9726ccef58')
    features = self.get_url_data(url, search_engine, timeout=10)
    if not self.aiPredict(features):
        return
    self.add_domain_to_blacklist(url)
    self.spam_points += self.sensitivity
def test_no_results():
    """A no-results XML reply must raise NoResultsException."""
    @all_requests
    def mocked_no_results(url, request):
        # Serve the canned "no results" fixture for every request.
        with open('tests/noresults_error.xml', 'rb') as fixture:
            body = fixture.read()
        return {'content': body}

    with HTTMock(mocked_no_results):
        client = yd.Yandex(api_user='******', api_key='fake')
        with pytest.raises(yd.NoResultsException):
            client.search(query='asdf')
def test_xml_parse():
    """A successful XML reply parses into 7 items carrying the expected keys."""
    @all_requests
    def mocked_success(url, request):
        # Serve the canned success fixture for every request.
        with open('tests/success.xml', 'rb') as fixture:
            body = fixture.read()
        return {'status_code': 200, 'content': body}

    with HTTMock(mocked_success):
        client = yd.Yandex(api_user='******', api_key='fake')
        results = client.search(query='asdf')
        assert results.found['strict'] == '7'
        assert len(results.items) == 7
        expected_keys = ('url', 'title', 'snippet', 'domain')
        for item in results.items:
            for key in expected_keys:
                assert key in item
import yandex_search

# Demo: run one Yandex XML-search query and dump the raw result items.
# NOTE(review): real account credentials used to be pasted in a comment
# here — keep API keys out of source control.
client = yandex_search.Yandex(api_user='******', api_key='pt598t6x')
query = "котики википедия"
print(client.search(query).items)
### ANSWERS DICT LIKE THIS ### ('Мыши', 'Пчёлы', 'Мухи') # 3 ELEMENTS!!!
import difflib
import yandex_search
import config

yandex = yandex_search.Yandex(api_user=config.API_USER, api_key=config.API_KEY)

# Map look-alike Latin letters to their Cyrillic counterparts so
# mixed-alphabet text compares as equal.
replacement = {'x': 'х', 'o': 'о', 'у': 'у', 'e': 'е', 'a': 'а'}


class Compare():
    """One quiz question with exactly three candidate answers.

    Normalizes the question (lowercase + Latin→Cyrillic look-alike
    substitution) on construction.
    """

    def __init__(self, question: str, answers: list):
        """:param question: question text (must be non-empty)
        :param answers: exactly three answer strings
        :raises Exception: on empty inputs or an answer count other than 3
        """
        self.question = question.lower()
        self.answers = answers
        if not self.question or \
                not self.answers:
            raise Exception("Question or answers is not defined")
        if len(answers) != 3:  # simplified from "< 3 or > 3"
            raise Exception("Answers list is not correct")
        # str.replace returns a new string; the original discarded the
        # result, so no substitution was ever applied.  Rebind instead.
        for (k, v) in replacement.items():
            self.question = self.question.replace(k, v)
            self.answers = [a.replace(k, v) for a in self.answers]

    def check(self):
        # NOTE(review): self.search() is not defined in the visible part of
        # this file; this method appears truncated here.
        r = self.search()
import yandex_search
import json

# Credentials are read from a JSON file kept outside the repository.
# Context manager closes the handle (the original leaked it).
with open("../api.txt") as api_file:
    api = json.load(api_file)
user = api["api_user"]
key = api["api_key"]
yandex = yandex_search.Yandex(api_user=user, api_key=key)


def yandex_query(key_word):
    """Search Yandex for *key_word* and print "title - url" per result.

    :param key_word: query string passed straight to the search API
    """
    results = yandex.search(key_word)
    # Iterate every item directly; the original used range(len - 1) and
    # silently dropped the last result.
    for c in results.items:
        print(c['title'], ' - ', c['url'], '\n')
import re
import os
import yandex_search
import pickle
from urllib import error
from bs4 import BeautifulSoup as BS
import cfscrape
import requests
import shutil
import data

# noinspection PyBroadException
# Mods whose metadata is known to be unusable / handled specially.
mods_exception = ['VoxelMap']

# SECURITY NOTE(review): hard-coded credentials; move to configuration.
yandex = yandex_search.Yandex(
    api_user='******', api_key='03.907013875:1908728c0c5f64a885f21721a1f1f4ee')


def unzip(file_path):
    # Extract only 'mcmod.info' from the archive at file_path into the
    # current directory.  Returns False when the archive has no such
    # member, True otherwise.
    # NOTE(review): zipfile is not imported in the visible part of this
    # file — confirm it is imported elsewhere.
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extract('mcmod.info')
    except KeyError:
        return False
    return True


def get_mod_info(file_name):
    # Read mod metadata from the mcmod.info inside the named mod archive
    # under data.user_mc_path.
    # NOTE(review): this function is truncated in the visible chunk.
    if unzip(os.path.join(data.user_mc_path, file_name)):
        with open('mcmod.info', 'rb') as file:
import additional
import requests
from telegram.ext import Updater, MessageHandler, Filters, CallbackQueryHandler
from apixu.client import ApixuClient
import logging
from azure.cognitiveservices.search.imagesearch import ImageSearchAPI
from msrest.authentication import CognitiveServicesCredentials
from newsapi import NewsApiClient

# Log everything to main.log; DEBUG level also captures library traffic.
logging.basicConfig(filename='main.log',
                    format='%(asctime)s %(levelname)s %(name)s %(message)s',
                    level=logging.DEBUG)

# Third-party service clients, all keyed from the external keys module.
# NOTE(review): keys, apiai and yandex_search are not imported in the
# visible part of this file — confirm they are imported elsewhere.
newsapi = NewsApiClient(api_key=keys.news_api)
app = apiai.ApiAI(keys.apiai)
yandex = yandex_search.Yandex(api_key=keys.yandex_key, api_user=keys.yandex_user)
client = ApixuClient(keys.apixu)
image_search = ImageSearchAPI(
    credentials=CognitiveServicesCredentials(keys.visual_search_key))

# Per-session dialogue state.
session_storage = {}

err = " Если у вас постоянно возникает ошибка с поиском, поиском по изображению или новостями," \
    " то рекомендую вам перезапустить меня командой /start ."


def get_toponym_delta(toponym):
    # Derive a span from the toponym's bounding box; corners arrive as
    # "lon lat" strings under boundedBy.Envelope.
    # NOTE(review): this function is truncated mid-expression in the
    # visible chunk.
    toponym_bounded_lower = tuple(
        toponym["boundedBy"]["Envelope"]["lowerCorner"].split(" "))
    toponym_bounded_upper = tuple(
        toponym["boundedBy"]["Envelope"]["upperCorner"].split(" "))
    return str(abs(float(toponym_bounded_lower[0]) -
def _append_url_features(data, url_data, label):
    # Append one URL's feature vector to the dataset columns.
    # label: 1 for phishing rows, 0 for benign rows.
    data["phishing"].append(label)
    data["length"].append(url_data["length"])
    data["dir_num"].append(url_data["dir_num"])
    data["special_char_num"].append(url_data["special_char_num"])
    data["tld_trust"].append(url_data["tld_trust"])
    data["index_num"].append(url_data["index_num"])
    data["subdomain_len"].append(url_data["subdomain_len"])
    data["subdomain_num"].append(url_data["subdomain_num"])
    data["out_resources"].append(url_data["out_resources"])
    data["robots_entries"].append(url_data["robots_entries"])
    data["url"].append(url_data["url"])


def extract_data(raw_data, force_update=False):
    """Build (or load) the URL feature dataset from phishing/benign URLs.

    Iterates both URL tables, extracting per-URL features via get_url_data,
    checkpointing every 300 rows to dataset<reps>.csv and resuming from the
    largest checkpoint already on disk.  The finished table is written to
    dataset.csv, which is read back and returned.

    :param raw_data: tuple (phishing DataFrame, benign DataFrame)
    :param force_update: rebuild dataset.csv even if it already exists
    :return: pandas DataFrame loaded from dataset.csv
    """
    reps = 0
    phishing, benign = raw_data[0], raw_data[1]
    data = {
        "phishing": [],
        "length": [],
        "out_resources": [],
        "dir_num": [],
        "special_char_num": [],
        "robots_entries": [],
        "tld_trust": [],
        "index_num": [],
        "subdomain_len": [],
        "subdomain_num": [],
        "url": []
    }
    if not os.path.isfile("dataset.csv") or force_update:
        # Resume from the largest 300-row checkpoint file on disk.  The
        # original passed the bare int (largest_dataset + 300) to
        # os.path.isfile, so the resume never matched a checkpoint.
        largest_dataset = 0
        while os.path.isfile("dataset{}.csv".format(largest_dataset + 300)):
            largest_dataset += 300
        try:
            # Drop phishing entries submitted before 2020.
            old = []
            for index, row in phishing.iterrows():
                date = datetime.strptime(row["submission_time"],
                                         "%Y-%m-%dT%H:%M:%S+00:00")
                if date.year < 2020:
                    old.append(index)
            phishing = phishing.drop(old)
            # SECURITY NOTE(review): hard-coded credentials; move to config.
            yandex = yandex_search.Yandex(
                api_user='******',
                api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
            for index, row in phishing.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                _append_url_features(data, get_url_data(url, yandex), 1)
            for index, row in benign.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                # Benign rows are labeled 0; the original appended 1 here
                # too, which made every sample positive.
                _append_url_features(data, get_url_data(url, yandex), 0)
            pd.DataFrame(data).to_csv("dataset.csv")
        except Exception as e:
            print("[ERROR]: {}".format(e))
    return pd.read_csv("dataset.csv")
import yandex_search
import sys

# CLI: print Yandex results restricted to the site given as argv[1].
if len(sys.argv) < 2:
    print("algo deu errado")
else:
    target_site = sys.argv[1]
    client = yandex_search.Yandex(api_user='******', api_key='mykey')
    print(client.search('site:' + target_site).items)
# Parameters FILE_NAME = "sites" # name of input file NUM_OF_RESULTS = 5 # number of results kept for each search OUTPUT_FILE_NAME = "output" # name of output file KEYWORD = " Privacy Policy" # keyword to search; format = company_name + keyword START = 8000 # start with # of company END = 8500 # terminate when reaches # of company API_KEY = "" count = 0 file = open(FILE_NAME + ".txt", 'r') ts = time.time() st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') output = open(OUTPUT_FILE_NAME + st + ".txt", 'w') yandex = yandex_search.Yandex(api_user='******', api_key=API_KEY) for line in file: if count == END: break count += 1 if count < START: continue if line.split()[1] == "Hidden": continue output.write("=====" + str(count) + " " + line.split()[1] + "=====" + "\n") try: results = yandex.search("'" + line.split()[1] + KEYWORD + "'").items print("Request#" + str(count) + " succeeded:" + line.split()[1]) except: