Code Example #1
File: main.py Project: haonanc/general-scraper
import yandex_search


def yandexSearch(api_user, api_key, top_k_results, prefixes, suffixes,
                 outputFileName):
    """
    Runs the Yandex Search API for every prefix/suffix combination. It
    requires a Yandex account; registration is free. See
    https://pypi.org/project/yandex-search/ for more details.

    :param api_user: [str] your credentials username
    :param api_key: [str] your credentials API key
    :param top_k_results: [int] only keep the top k results
    :param prefixes: [list] prefixes for searches
    :param suffixes: [list] suffixes for searches
    :param outputFileName: [str] output file name (".txt" is appended)
    :return: None
    """
    yandex = yandex_search.Yandex(api_user=api_user, api_key=api_key)
    with open(outputFileName + ".txt", 'w') as output:
        for prefix in prefixes:
            for suffix in suffixes:
                output.write("=====" + prefix + suffix + "=====" + "\n")
                try:
                    results = yandex.search(prefix + suffix).items
                    # Slice so a short result list does not raise IndexError.
                    for item in results[:top_k_results]:
                        output.write(str(item['url']) + "\n")
                except Exception:
                    print("request failed")
Code Example #2
import os

import requests
import yandex_search


def main(args):
    os.makedirs(args.out_dir, exist_ok=True)

    # setup_logger() is defined elsewhere in the project.
    logger = setup_logger()

    yandex = yandex_search.Yandex(api_user=args.user, api_key=args.key)
    with open(args.queries_file, 'r') as queries_f:
        for query_i, query in enumerate(queries_f):
            query = query.strip()
            if not query:
                continue

            logger.info(f'Query {query_i}: {query}')
            query_res_i = 0
            for page_i in range(args.get_pages):
                for found_item in yandex.search(query, page=page_i).items:
                    url = found_item['url']
                    logger.info(f'Found item {query_res_i}: {url}')

                    resp = requests.get(url)
                    # resp.content is bytes, so the file must be opened in
                    # binary mode (the original used text mode).
                    with open(
                            os.path.join(
                                args.out_dir,
                                f'{query_i:03d}_{query_res_i:05d}.html'),
                            'wb') as item_f:
                        item_f.write(resp.content)

                    query_res_i += 1
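
main() reads its settings from an args namespace. A plausible argparse wiring, inferred purely from the attributes the function accesses (the flag names below do not appear in the source), might be:

import argparse

# Hypothetical CLI wiring; only the attribute names are taken from main().
parser = argparse.ArgumentParser()
parser.add_argument('--user', required=True)
parser.add_argument('--key', required=True)
parser.add_argument('--queries-file', dest='queries_file', required=True)
parser.add_argument('--out-dir', dest='out_dir', default='out')
parser.add_argument('--get-pages', dest='get_pages', type=int, default=1)
main(parser.parse_args())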
Code Example #3
import json
import os
import urllib.request

import pandas as pd
import requests
import yandex_search


def get_data(phishtank_key, force_update=False):
    if not os.path.isfile("phishtank.csv") or force_update:
        # show_progress is a download reporthook defined elsewhere in the
        # project.
        urllib.request.urlretrieve(
            "http://data.phishtank.com/data/{}/online-valid.csv".format(
                phishtank_key), "phishtank.csv", show_progress)
    if not os.path.isfile("common.csv") or force_update:
        data = {"url": []}
        with open("keywordList") as wordlist:
            keywords = wordlist.read().split("\n")
        suggestions = []
        for word in keywords:
            URL = (
                "http://suggestqueries.google.com/complete/search?client=firefox&q="
                + word)
            headers = {'User-agent': 'Mozilla/5.0'}
            response = requests.get(URL, headers=headers)
            result = json.loads(response.content.decode('utf-8'))
            for r in result[1]:
                suggestions.append(r)
        yandex = yandex_search.Yandex(
            api_user='******',
            api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
        for word in suggestions:
            top10 = yandex.search(word).items[0:10]
            for site in top10:
                # Each result item is a dict; keep only its URL (the
                # original appended the whole dict).
                data["url"].append(site['url'])
        common = pd.DataFrame(data)
        common.to_csv("common.csv")
    urls = (pd.read_csv("phishtank.csv"), pd.read_csv("common.csv"))
    return urls
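
A minimal call might look like this, assuming a PhishTank API key is at hand and the project's show_progress hook is defined; the environment-variable name is an assumption.

import os

# Hypothetical invocation; PHISHTANK_KEY is an assumed environment variable.
phishing_df, benign_df = get_data(os.environ['PHISHTANK_KEY'])
print(len(phishing_df), "phishing URLs,", len(benign_df), "benign URLs")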
Code Example #4
File: mail.py Project: theFr1nge/KulYutmaz
# Method excerpted from a larger class; the self.* helpers are defined there.
def check_url(self, url):
    yandex = yandex_search.Yandex(
        api_user='******',
        api_key='03.1042294429:b8e679f9acadef49ebab0d9726ccef58')
    data = self.get_url_data(url, yandex, timeout=10)
    if self.aiPredict(data):
        self.add_domain_to_blacklist(url)
        self.spam_points += self.sensitivity
Code Example #5
import pytest
import yandex_search as yd
from httmock import HTTMock, all_requests


def test_no_results():
    @all_requests
    def response_credential(url, request):
        with open('tests/noresults_error.xml', 'rb') as f:
            xml = f.read()
        return {'content': xml}

    with HTTMock(response_credential):
        yandex = yd.Yandex(api_user='******', api_key='fake')
        with pytest.raises(yd.NoResultsException):
            yandex.search(query='asdf')
Code Example #6
import yandex_search as yd
from httmock import HTTMock, all_requests


def test_xml_parse():
    @all_requests
    def response_success(url, request):
        with open('tests/success.xml', 'rb') as f:
            xml = f.read()
        return {'status_code': 200,
                'content': xml}

    with HTTMock(response_success):
        yandex = yd.Yandex(api_user='******', api_key='fake')
        results = yandex.search(query='asdf')
        assert results.found['strict'] == '7'
        assert len(results.items) == 7
        for item in results.items:
            assert 'url' in item
            assert 'title' in item
            assert 'snippet' in item
            assert 'domain' in item
Code Example #7
import yandex_search


# https://yandex.ru/search/xml?user=lvv-2003&key=03.195052229:15b4cdde7ff532f1a4b3c8db5703e842

yandex = yandex_search.Yandex(api_user='******', api_key='pt598t6x')
print(yandex.search("котики википедия").items)
Code Example #8
### ANSWERS DICT LIKE THIS
### ('Мыши', 'Пчёлы', 'Мухи') # 3 ELEMENTS!!!
import difflib
import yandex_search
import config

yandex = yandex_search.Yandex(api_user=config.API_USER, api_key=config.API_KEY)

# Latin-to-Cyrillic homoglyph map. The original's 'у': 'у' entry mapped the
# Cyrillic letter to itself; the intended key is the Latin 'y'.
replacement = {'x': 'х', 'o': 'о', 'y': 'у', 'e': 'е', 'a': 'а'}


class Compare:
    def __init__(self, question: str, answers: list):
        self.question = question.lower()
        self.answers = answers

        if not self.question or not self.answers:
            raise ValueError("Question or answers is not defined")

        if len(answers) != 3:
            raise ValueError("Answers list is not correct")

        # str.replace returns a new string, so the results must be assigned
        # back; the original discarded them.
        for (k, v) in replacement.items():
            self.question = self.question.replace(k, v)
            self.answers = [x.replace(k, v) for x in self.answers]

    def check(self):
        r = self.search()
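
The snippet ends mid-method and search() is not shown. Under those caveats, constructing the class might look like this; the question string is hypothetical, while the answers come from the comment at the top.

# Hypothetical usage; Compare.check() depends on the unshown search() method.
cmp = Compare("пример вопроса?", ["Мыши", "Пчёлы", "Мухи"])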
Code Example #9
import json

import yandex_search

# Load credentials from a JSON file instead of hard-coding them.
with open("../api.txt") as api_f:
    api = json.load(api_f)
user = api["api_user"]
key = api["api_key"]
yandex = yandex_search.Yandex(api_user=user, api_key=key)


def yandex_query(key_word):
    results = yandex.search(key_word)
    # Iterate over every result; the original's range(len - 1) silently
    # dropped the last item.
    for item in results.items:
        print(item['title'], ' - ', item['url'], '\n')
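
Because the library raises NoResultsException on an empty result set (exercised in Code Example #5), a guarded call might look like:

# Hypothetical usage.
try:
    yandex_query("python yandex_search")
except yandex_search.NoResultsException:
    print("no results")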
Code Example #10
File: script.py Project: TimNekk/mc-mod-updater
import re
import os
import zipfile  # used by unzip() below but missing from the original imports
import yandex_search
import pickle
from urllib import error
from bs4 import BeautifulSoup as BS
import cfscrape
import requests
import shutil
import data

# noinspection PyBroadException

mods_exception = ['VoxelMap']
yandex = yandex_search.Yandex(
    api_user='******',
    api_key='03.907013875:1908728c0c5f64a885f21721a1f1f4ee')


def unzip(file_path):
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extract('mcmod.info')
    except KeyError:
        return False
    return True


def get_mod_info(file_name):
    if unzip(os.path.join(data.user_mc_path, file_name)):
        with open('mcmod.info', 'rb') as file:
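
The snippet is truncated here. mcmod.info in Forge mods is a JSON document, so a purely illustrative continuation (an assumption, not the project's code) could parse it as follows.

import json

# Illustrative sketch only: read the metadata extracted by unzip().
def get_mod_info_sketch(file_name):
    if unzip(os.path.join(data.user_mc_path, file_name)):
        with open('mcmod.info', 'rb') as file:
            info = json.loads(file.read().decode('utf-8', errors='ignore'))
        # Forge's mcmod.info is usually a list of mod descriptors.
        entry = info[0] if isinstance(info, list) else info
        return entry.get('name'), entry.get('version')
    return None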
Code Example #11
import additional
import apiai  # apiai, keys, and yandex_search were missing from the
import keys   # original imports but are used below
import requests
import yandex_search
from telegram.ext import Updater, MessageHandler, Filters, CallbackQueryHandler
from apixu.client import ApixuClient
import logging
from azure.cognitiveservices.search.imagesearch import ImageSearchAPI
from msrest.authentication import CognitiveServicesCredentials
from newsapi import NewsApiClient

logging.basicConfig(filename='main.log',
                    format='%(asctime)s %(levelname)s %(name)s %(message)s',
                    level=logging.DEBUG)

newsapi = NewsApiClient(api_key=keys.news_api)
app = apiai.ApiAI(keys.apiai)
yandex = yandex_search.Yandex(api_key=keys.yandex_key,
                              api_user=keys.yandex_user)
client = ApixuClient(keys.apixu)
image_search = ImageSearchAPI(
    credentials=CognitiveServicesCredentials(keys.visual_search_key))

session_storage = {}
err = " Если у вас постоянно возникает ошибка с поиском, поиском по изображению или новостями," \
      " то рекомендую вам перезапустить меня командой /start ."


def get_toponym_delta(toponym):
    toponym_bounded_lower = tuple(
        toponym["boundedBy"]["Envelope"]["lowerCorner"].split(" "))
    toponym_bounded_upper = tuple(
        toponym["boundedBy"]["Envelope"]["upperCorner"].split(" "))
    return str(abs(float(toponym_bounded_lower[0]) -
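
The return statement is cut off in the source. A common shape for such a helper (an assumption, not the project's code) returns the bounding box's longitude and latitude spans:

# Illustrative sketch of the truncated helper.
def get_toponym_delta_sketch(toponym):
    lower = toponym["boundedBy"]["Envelope"]["lowerCorner"].split(" ")
    upper = toponym["boundedBy"]["Envelope"]["upperCorner"].split(" ")
    dx = abs(float(lower[0]) - float(upper[0]))
    dy = abs(float(lower[1]) - float(upper[1]))
    return str(dx), str(dy)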
Code Example #12
import os
from datetime import datetime

import pandas as pd
import yandex_search


def extract_data(raw_data, force_update=False):
    reps = 0
    phishing, benign = raw_data[0], raw_data[1]
    data = {
        "phishing": [],
        "length": [],
        "out_resources": [],
        "dir_num": [],
        "special_char_num": [],
        "robots_entries": [],
        "tld_trust": [],
        "index_num": [],
        "subdomain_len": [],
        "subdomain_num": [],
        "url": []
    }
    if not os.path.isfile("dataset.csv") or force_update:
        largest_dataset = 0
        while os.path.isfile(largest_dataset + 300):
            largest_dataset += 300
        try:
            # filter old sites
            old = []
            for index, row in phishing.iterrows():
                date = datetime.strptime(row["submission_time"],
                                         "%Y-%m-%dT%H:%M:%S+00:00")
                if date.year < 2020:
                    old.append(index)
            phishing = phishing.drop(old)
            yandex = yandex_search.Yandex(
                api_user='******',
                api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
            for index, row in phishing.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                url_data = get_url_data(url, yandex)
                data["phishing"].append(1)
                data["length"].append(url_data["length"])
                data["dir_num"].append(url_data["dir_num"])
                data["special_char_num"].append(url_data["special_char_num"])
                data["tld_trust"].append(url_data["tld_trust"])
                data["index_num"].append(url_data["index_num"])
                data["subdomain_len"].append(url_data["subdomain_len"])
                data["subdomain_num"].append(url_data["subdomain_num"])
                data["out_resources"].append(url_data["out_resources"])
                data["robots_entries"].append(url_data["robots_entries"])
                data["url"].append(url_data["url"])
            for index, row in benign.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                url_data = get_url_data(url, yandex)
                data["phishing"].append(1)
                data["length"].append(url_data["length"])
                data["dir_num"].append(url_data["dir_num"])
                data["special_char_num"].append(url_data["special_char_num"])
                data["tld_trust"].append(url_data["tld_trust"])
                data["index_num"].append(url_data["index_num"])
                data["subdomain_len"].append(url_data["subdomain_len"])
                data["subdomain_num"].append(url_data["subdomain_num"])
                data["out_resources"].append(url_data["out_resources"])
                data["robots_entries"].append(url_data["robots_entries"])
                data["url"].append(url_data["url"])
            pd.DataFrame(data).to_csv("dataset.csv".format(reps))
        except Exception as e:
            print("[ERROR]: {}".format(e))
    return pd.read_csv("dataset.csv")
Code Example #13
import yandex_search
import sys

if len(sys.argv) >= 2:
    #print(sys.argv[0])
    site = sys.argv[1]
    yandex = yandex_search.Yandex(api_user='******', api_key='mykey')
    # Restrict results to the given domain with Yandex's "site:" operator.
    print(yandex.search('site:' + site).items)
else:
    print("algo deu errado")  # Portuguese: "something went wrong"
Code Example #14
import datetime
import time

import yandex_search

# Parameters
FILE_NAME = "sites"  # name of input file
NUM_OF_RESULTS = 5  # number of results kept for each search
OUTPUT_FILE_NAME = "output"  # name of output file
KEYWORD = " Privacy Policy"  # keyword to search; format = company_name + keyword
START = 8000  # index of the first company to process
END = 8500  # stop once this company index is reached
API_KEY = ""

count = 0
file = open(FILE_NAME + ".txt", 'r')
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
output = open(OUTPUT_FILE_NAME + st + ".txt", 'w')
yandex = yandex_search.Yandex(api_user='******', api_key=API_KEY)

for line in file:

    if count == END:
        break
    count += 1
    if count < START:
        continue
    if line.split()[1] == "Hidden":
        continue
    output.write("=====" + str(count) + " " + line.split()[1] + "=====" + "\n")
    try:
        results = yandex.search("'" + line.split()[1] + KEYWORD + "'").items
        print("Request#" + str(count) + " succeeded:" + line.split()[1])
    except: