Example #1
class TranslatePO(object):
    def __init__(self,
                 dest: str = 'en',
                 src: str = 'auto',
                 proxy_size=20,
                 th_size=0):
        self.dest = dest
        self.src = src

        self.proxies = Proxies(proxy_len=proxy_size)
        self.proxies.verify_proxies()
        self.loading_index = 0
        self.loading_len = 0
        self.th_size = th_size

    def __progress(self):
        self.loading_index += 1
        progress = self.loading_index * 100 // self.loading_len
        print('Loading: %s%%%s' % (progress, " " * 10), end='\r')

    def __translate(self, line):
        self.__progress()

        curr_proxy = self.proxies.done_proxies.get()
        tr = Translator(proxies=curr_proxy, timeout=10)

        res = None

        try:
            res = tr.translate(line.msgid, dest=self.dest, src=self.src)
        except Exception:
            curr_proxy = self.proxies.done_proxies.get()
            tr = Translator(proxies=curr_proxy, timeout=10)
            res = tr.translate(line.msgid, dest=self.dest, src=self.src)
        finally:
            self.proxies.done_proxies.put(curr_proxy)
            line.msgstr = res.text if res else ''

    def po_translate(self, file: str, out='res.po'):
        po_file = pofile(file)
        self.loading_len = len(po_file)

        if self.th_size:
            th = ThreadPool(self.th_size, self.__translate, data=po_file)
            th.start()
            th.join()
        else:
            for el in po_file:
                self.__translate(el)

        print('\nFINISH')

        with open(out, 'w') as out_file:
            for po_line in po_file:
                out_file.write('%s\n' % str(po_line))
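A minimal usage sketch for the class above, assuming TranslatePO is importable from its module; the .po file names, language codes, and thread count are illustrative placeholders:

# Hypothetical usage; 'messages.po' and 'messages_de.po' are placeholder paths.
translator = TranslatePO(dest='de', src='en', proxy_size=10, th_size=4)
translator.po_translate('messages.po', out='messages_de.po')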
Example #3
def start(key, sort_type):
    createfile(key)  # assumed helper name; the original reads "reatefile"
    proxies = Proxies().main()
    url_list = []
    for page_num in range(1, 15):
        try:
            url_list = search(key, sort_type, page_num, proxies)
            for url in url_list:
                link(key, url, proxies)
                time.sleep(random.uniform(1, 3))
        except Exception as e:
            print('* error *', e)
            # Refresh the proxy pool when the page returned no results
            if len(url_list) == 0:
                proxies = Proxies().main()
Example #4
    def get_source_code_submissions(proxies,
                                    proxy_index,
                                    contest_id,
                                    submission_ids,
                                    pbar,
                                    submission_index=0,
                                    iterations=1000):
        if iterations > 0:
            # Use a generic User-Agent header for the requests
            headers = {'User-Agent': 'anything'}

            # if all proxies have been exhausted, get new list of proxies
            if proxy_index >= len(proxies):
                proxies = Proxies.get_proxies()
                proxy_index = 0

            # Set the proxy for the requests
            proxy_dict = {
                'http': f'{proxies[proxy_index]}',
                'https': f'{proxies[proxy_index]}',
            }

            try:
                # Go over all submissions and make a request to the submission page
                for submission in submission_ids[submission_index:]:
                    response = requests.get(
                        f'https://codeforces.com/contest/{contest_id}/submission/{submission}',
                        timeout=10,
                        headers=headers,
                        proxies=proxy_dict,
                        allow_redirects=False)

                    # Transform to soup object with html parser
                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Find the actual program code on page
                    solution = soup.find("pre", {
                        "id": "program-source-text"
                    }).text.strip()

                    os.makedirs(f'data/contests_solutions_code/{contest_id}/',
                                exist_ok=True)

                    with open(
                            f'data/contests_solutions_code/{contest_id}/{submission}.txt',
                            'w') as text_file:
                        text_file.write(solution)

                    # Track the current position so a retry can resume here, then update the progress bar
                    submission_index += 1
                    pbar.update()

            except Exception:
                proxy_index += 1
                iterations -= 1
                Scraper.get_source_code_submissions(proxies, proxy_index,
                                                    contest_id, submission_ids,
                                                    pbar, submission_index,
                                                    iterations)
Example #5
class HttpProxyMiddleware(object):
    def __init__(self, proxy_file, proxy_bypass_percent, **kwargs):
        self.bypass_percent = int(proxy_bypass_percent)
        self.proxies = Proxies(proxy_file, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            crawler.settings.get(
                'PROXY_SERVER_LIST_CACHE_FILE',
                'proxies.txt'
            ),
            crawler.settings.get(
                'PROXY_BYPASS_PERCENT',
                0
            ),
            logger=lambda message: log.msg(message),
        )

    def process_request(self, request, spider):
        n = random.randint(0, 99)
        if n >= self.bypass_percent:
            proxy = self.proxies.get_proxy()
            log.msg('Using proxy ' + proxy, spider=spider)
            request.meta['proxy'] = 'http://' + proxy
        else:
            if 'proxy' in request.meta:
                del request.meta['proxy']
            log.msg('No proxy used', spider=spider)
Example #6
def main():
    loop = asyncio.get_event_loop()
    with aiohttp.ClientSession(loop=loop) as session:
        proxies = Proxies(
            session, 30,
            'http://gimmeproxy.com/api/getProxy?protocol=http&supportsHttps=true&maxCheckPeriod=3600'
        )
        area = Area('https://www.freecycle.org/browse/UK/London')
        groups = loop.run_until_complete(
            area.get_groups(session, proxies, SEARCH_TERMS, FROM, TO))
    display(groups)
Example #8
class HttpProxyMiddleware(object):
    def __init__(self, cache_file):
        if (cache_file is None):
            cache_file = '/tmp/__proxy_list_cache.json'

        self.proxies = Proxies(cache_file)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings['PROXY_SERVER_LIST_CACHE_FILE'])

    def process_request(self, request, spider):
        proxy = self.proxies.getProxy()
        print "Using proxy %s" % proxy
        request.meta['proxy'] = "http://%s" % proxy
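For context, a Scrapy downloader middleware like this is normally enabled through the project's settings.py; a minimal sketch, where the module path 'myproject.middlewares' and the priority value are placeholders rather than part of the example:

# settings.py (sketch)
PROXY_SERVER_LIST_CACHE_FILE = '/tmp/__proxy_list_cache.json'
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.HttpProxyMiddleware': 543,
}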
Example #9
class Requests:
    def __init__(self):
        self.proxies = Proxies()

    def request(self, method, url, **kwargs):
        exception = None
        TRIES = 10
        for i in range(TRIES):
            try:
                with self.proxies.borrow() as proxy:
                    kwargs['proxies'] = dict(http='http://%s:%s' % (proxy.host, proxy.port),
                                             https='https://%s:%s' % (proxy.host, proxy.port))
                    kwargs['timeout'] = (10, 20)
                    response = requests.request(method=method, url=url, **kwargs)
                    if response.headers['content-type'] == 'application/json':
                        response.json()  # parse now so invalid JSON raises even on a 200 response
                    return response
            except Exception as e:
                log.warning('Got exception %s try(%s/%s)' % (e, i + 1, TRIES))
                exception = e
        raise exception

    def get(self, url, params=None, **kwargs):
        kwargs.setdefault('allow_redirects', True)
        return self.request('get', url, params=params, **kwargs)

    def options(self, url, **kwargs):
        kwargs.setdefault('allow_redirects', True)
        return self.request('options', url, **kwargs)

    def head(self, url, **kwargs):
        kwargs.setdefault('allow_redirects', False)
        return self.request('head', url, **kwargs)

    def post(self, url, data=None, json=None, **kwargs):
        return self.request('post', url, data=data, json=json, **kwargs)

    def put(self, url, data=None, **kwargs):
        return self.request('put', url, data=data, **kwargs)

    def patch(self, url, data=None, **kwargs):
        return self.request('patch', url, data=data, **kwargs)

    def delete(self, url, **kwargs):
        return self.request('delete', url, **kwargs)
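A brief usage sketch for this wrapper, assuming the Requests class above is importable; the URL and query parameters are placeholders:

# Hypothetical call through the retrying, proxy-rotating wrapper.
client = Requests()
response = client.get('https://example.com/api/items', params={'page': 1})
print(response.status_code)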
Example #10
def baiduzhidaosearch(keyword, page):
    ret = {
        'code': 1002,
        'msg': 'failure',
        'data': []
    }
    try:
        page = int(page) * 10
        url = u'http://zhidao.baidu.com/search?ct=17&pn=%s&tn=ikaslist&rn=10&word=%s' % (page, keyword)
        headers = Headers.getHeaders()
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=60, proxies=proxies)
        if req.status_code == 200:
            ret['code'] = 1001
            ret['msg'] = 'success'
            id = []
            title = []
            req.encoding = 'gbk'
            html = req.text.encode(encoding="utf-8", errors="ignore").decode("utf-8", errors="ignore")
            selector = etree.HTML(html)
            urls = selector.xpath('//div[@class="list"]/dl/dt[1]/a/@href')
            for u in urls:
                match_obj = re.search(r'question/(.*?).html', u, re.M | re.I)
                id.append(match_obj.group(1))
            titles = selector.xpath('//div[@class="list"]/dl/dt[1]/a')
            for t in titles:
                title.append(etree.tostring(t, encoding='utf8', method="html"))
            for n in range(len(id)):
                ret['data'].append(
                        {'cid': id[n], 'title': re.search(r'"ti">(.*?)</a>', title[n], re.M | re.I).group(1)})
    except Exception as e:
        print e
    return simplejson.dumps(ret)
Example #11
def main():
    if len(sys.argv) < 2:
        print("Usage: " + sys.argv[0] + " <board>")
        return

    board = sys.argv[1]

    posts = {}
    try:
        posts = json.loads(open("data/posts_%s.json" % board).read())
    except Exception as e:
        pass

    regexps_like = [
        regex.split("\n")[0]
        for regex in open("data/regexps_like").readlines()
    ]
    regexps_dislike = [
        regex.split("\n")[0]
        for regex in open("data/regexps_dislike").readlines()
    ]
    comparator_dislike = Comparator(
        open("data/comparator.wasm", "rb").read(), [
            base64.b64decode(image.split("\n")[0])
            for image in open("data/images").readlines()
        ])
    checker = Checker(regexps_like, regexps_dislike, comparator_dislike)

    proxies = Proxies(
        [proxy.split("\n")[0] for proxy in open("data/proxies").readlines()])
    network = Network(proxies, 10)

    liker = Liker(board, checker, posts, network)

    network.start()
    network.join()
Example #14
import time

import requests
from lxml import etree

from headers import Headers
from config import Config
from mysqldao import MysqlDao
from proxies import Proxies

url_main = 'http://www.bttiantang.com/movie.php?/order/id/'

page_count = 10
# 1367 is the total number of pages; update this value periodically
for one in xrange(1, page_count):
    url = url_main + str(one)
    print(url)
    headers = Headers.getHeaders()
    try:
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=30, proxies=proxies)
        code = req.status_code
        print(code)
        if code == 200:
            html = req.content
            selector = etree.HTML(html)
            content_urls = selector.xpath('//*[contains(@class,"litpic")]/a[1]/@href')
            for content_url in content_urls:
                content_url = Config.url_main + content_url
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                insert_value = '"' + content_url + '",0,"' + created_at + '"'
                mysqlDao = MysqlDao()
                sql = 'select url from bttiantang_url ORDER BY id desc limit 0,1 '
                ret = mysqlDao.execute(sql)
                for r in ret:
Example #15
#!/usr/bin/env python3

""""""
from logger import Logger
from proxies import Proxies
from scraper import Scraper
from spoofing import Headers


if __name__ == "__main__":
    headers = Headers()
    logger = Logger()
    proxies = Proxies(headers, logger)
    scraper = Scraper(headers, proxies, logger)
    scraper.scrape()
Example #17
    run = True
    task = 0

    while run:
        try:
            task = int(show_task())
        except ValueError:
            print('Please enter a valid option number')
            continue
        if task == 0:
            run = False
        elif task == 1:
            create_db()
        elif task == 2:
            Proxies().run()
        elif task == 3:
            data_dir = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), 'data')
            while os.path.exists(data_dir):
                shutil.rmtree(data_dir)
            WeChatSogou().run_hy()
        elif task == 4:
            names = input('''
            ** Enter the official account name(s) to crawl (multiple names allowed, comma-separated):
            ''').strip('"')
            names = names.split(',')
            proxy = input('''
            ** Optionally enter a proxy manually (ip:port, only one is accepted; press Enter to select one automatically):
            ''')
            WeChatSogou().run_gzh(names=names, iproxy=proxy)
Example #18
import datetime
import time

from combiner import CSVCombiner
from downloader import CanWeatherDataDownloader
from proxies import Proxies

st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H%M%S')

dldr = CanWeatherDataDownloader()

# Assumption: set_download_dir is a setter method, like set_proxies below
dldr.set_download_dir('./download/' + st + '/')
dldr.set_proxies(Proxies.get_random_proxy())
dldr.all_station_id = dldr.read_station_id_from_file('./station_id.txt')
dldr.download_daily_data(1990, 2020, thread=20)

csv_combiner = CSVCombiner(dldr.download_dir, 'weather_data.csv').run()
Example #20
def get_solutions(amt_threads=40):
    # Get all contests that have already been scraped
    scraped_contests = []

    for _, dirs, _ in os.walk('data/contests_solutions_code'):
        scraped_contests += [int(dir) for dir in dirs]

    # Get the metadata files to find submission ids to scrape solutions for
    metadata_files = [
        file for file in os.listdir('data/contests_solutions_metadata')
        if os.path.isfile(f'data/contests_solutions_metadata/{file}')
    ]

    if len(metadata_files) == 0:
        print(
            'Run "get_solutions_metadata" first to fetch the solutions to scrape before running this function'
        )
        return

    for metadata_file in metadata_files:
        # Open the solutions metadata file where solutions will be coupled to
        solutions_df = pd.read_csv(
            f'data/contests_solutions_metadata/{metadata_file}')

        # Get all unique contests in this dataset that are not scraped yet
        contests = list(
            solutions_df[~solutions_df['contestId'].isin(scraped_contests)]
            ['contestId'].unique())

        for contest in contests:
            print(f'starting with contest {contest}...')
            contest_submissions = solutions_df[solutions_df['contestId'] ==
                                               contest]['solutionId']

            # Work with amt of threads to parallelize the requests
            threads = []

            submissions_per_thread = math.ceil(
                len(contest_submissions) / amt_threads)

            proxies = Proxies.get_proxies()

            # Progress bar to indicate current progress and speed
            pbar = tqdm(total=len(contest_submissions))

            for index in range(0, len(contest_submissions),
                               submissions_per_thread):
                # Let every thread start with a random proxy to spread the search space
                proxy_index = random.randrange(0, len(proxies))

                # Get solutions for the contests from the scraper
                threads.append(
                    threading.Thread(
                        target=Scraper.get_source_code_submissions,
                        args=(
                            proxies,
                            proxy_index,
                            contest,
                            contest_submissions[index:index +
                                                submissions_per_thread],
                            pbar,
                        )))

            for t in threads:
                t.start()

            for t in threads:
                t.join()