Esempio n. 1
0
def parse_wipe(table):
    """Recreate the backing datafile from scratch and refresh *table* with it.

    Overwrites any existing file named "test". Always returns True.
    """
    fresh = Datafile(filename="test")
    print("Creating new datafile (and overwriting any existing files!)")
    fresh.create_new()
    table.refresh(fresh)
    print("Datafile created")
    return True
Esempio n. 2
0
 def run(self):
     """Worker thread: crawl queued URLs until 10000 records are stored."""
     while Datafile.d.qsize() < 10000:
         if not self.Urlqueue.empty():
             target = self.Urlqueue.get()
             result = self.Crawl.crawl(target)
             print(result)
             print(self.Crawl.proxy)
             if result:
                 # NOTE(review): key spelling 'sucess_count' kept — it matches
                 # the counter dict defined elsewhere in this module.
                 self.count['sucess_count'] += 1
                 Datafile.dumps(result)
             else:
                 # Crawl failed: count it and requeue the URL for a retry.
                 self.count['failed_count'] += 1
                 self.Urlqueue.put(target)
         self.log()
Esempio n. 3
0
 def get_datafile_handle(self, pair_id):
     """Return a Datafile handle for the pair *pair_id*, opened for binary writing."""
     pair = self.get_src_dst_by_id(pair_id)
     return Datafile.open_datafile(pair['id'],
                                   pair['binary_file'],
                                   max_records=pair['max_records'])
Esempio n. 4
0
def sort_records_into_new_datafile(records, new_path):
    """Write *records*, sorted by their first field, into a new datafile at *new_path*."""
    out_file = Datafile.create_new_datafile(-1, new_path)
    ordered = sorted(records, key=operator.itemgetter(0))
    started = time.time()
    out_file.write_all_records(ordered)
    elapsed = time.time() - started
    logging.info("Wrote %i records to %s", len(records), new_path)
    logging.debug("Took %0.2f seconds to write all records", elapsed)
Esempio n. 5
0
def new_datafile(args):
    """Create a datafile at args.output and fill it with null (all-zero) records."""
    created = Datafile.create_new_datafile(-1, args.output,
                                           version=args.version,
                                           data_length=args.data_length,
                                           offset=args.offset,
                                           number_of_records=args.num_records)
    created.file.seek(created.header_length)
    # One zeroed record, written num_records times (hoisted out of the loop).
    blank_record = bytes(created.record_length)
    for _ in range(args.num_records):
        created.file.write(blank_record)
    logging.info("Wrote new datafile: %s with %i null records",
                 args.output, args.num_records)
    show_datafile_info(created)
Esempio n. 6
0
    def start(self):
        """Drive one crawl session.

        If a checkpoint file exists, reload its URLs into the queue;
        otherwise seed the queue via send()/recv() threads.  Then run the
        crawl workers (self.run), persist the collected data, and dump any
        URLs still queued so they are not lost.
        """
        if Datafile.is_exit():  # checkpoint found: resume the previous session
            link = Datafile.open_csv()
            for t in link:
                self.Urlqueue.put(t[0])
        else:
            # Seed the URL queue from scratch.
            boot_threading = threading.Thread(target=self.send)
            boot_threading.start()
            boot_threading.join()

            # NOTE(review): start() followed immediately by join() inside the
            # loop makes these "threads" run one at a time (sequentially) —
            # confirm this is intended.
            for i in range(self.count['count']):
                t = threading.Thread(target=self.recv, )
                t.start()
                t.join()
            if self.count['failed_count'] != 0:
                # Retry once for every failed fetch recorded so far.
                for i in range(self.count['failed_count']):
                    t = threading.Thread(target=self.recv, )
                    t.start()
                    t.join()
        self.count['count'] = self.Urlqueue.qsize()
        self.count['failed_count'] = 0

        # Spawn the crawl workers (see run()).  They are also start()+join()ed
        # in the same loop, so they execute sequentially — TODO confirm intended.
        thread_list = []
        for s in range(thread_count):
            workerthread = threading.Thread(target=self.run, )
            thread_list.append(workerthread)
        for t in thread_list:
            t.start()
            t.join()
        Datafile.save('data')
        # Flush whatever is still queued so nothing is lost on exit.
        while not self.Urlqueue.empty():
            sa = [self.Urlqueue.get()]
            Datafile.dumps(sa)
        Datafile.save('rest')
        print('*******************')
        self.log()
Esempio n. 7
0
 def get_or_make_datafile(self, src_ip, dst_ip):
     """Return the datafile for (src_ip, dst_ip), opening or creating it on demand.

     Results are cached in self.datafiles keyed by the address pair.
     """
     key = (src_ip, dst_ip)
     if key in self.datafiles:
         return self.datafiles[key]
     # Pair may already be registered in the DB: reopen its existing file.
     pair_row = self.databaseMysql.get_binary_src_dst_by_pair(
         src_ip, dst_ip)
     if pair_row:
         handle = self.get_datafile_handle(pair_row['id'])
     else:
         # Unknown pair: register it in the DB and create a fresh binary file.
         directory = self.db_params['binary_data_directory']
         filename = '_'.join([src_ip, dst_ip, str(int(time.time()))]) + '.ping'
         binary_file = os.path.abspath(os.path.join(directory, filename))
         pair_id = self.databaseMysql.make_binary_src_dst_pair(
             src_ip, dst_ip, binary_file,
             DatabaseBinary.DEFAULT_MAX_RECORDS)
         handle = Datafile.create_new_datafile(
             pair_id,
             binary_file,
             max_records=DatabaseBinary.DEFAULT_MAX_RECORDS)
     self.datafiles[key] = handle
     return handle
Esempio n. 8
0
        return True
    except ValueError:
        print('Error: Expected code \"%s\" to be an integer' % values[0])
        return False


def parse_btree(table):
    """Pretty-print the table's BTree; report and return None if it does not exist yet."""
    btree = table.btree
    if btree is None:
        print('BTree not created yet')
        return None
    btree.pretty_print()


if __name__ == "__main__":
    # Open the existing datafile; uncomment create_new() to start from scratch.
    datafile = Datafile(filename="test")
    #datafile.create_new()
    table = Table.init(datafile)

    print('SGBD started')
    # Simple REPL: read commands until the user types 'exit'.
    while True:
        cmd = input('$ ')
        if cmd == 'exit':
            print('Closing SGBD...')
            table.exit()
            break
        parse_input(cmd, table)
Esempio n. 9
0
def open_datafile(path):
    """Open an existing datafile at *path* using a placeholder pair id of -1."""
    return Datafile.open_datafile(-1, path)
Esempio n. 10
0
"""
Description:
- 多线程,队列操作,断点续传

author:https://github.com/HANKAIluo
2018.3.18
"""

import threading
from queue import Queue
from crawl import Crawl
from datafile import Datafile
from getproxy import get_proxy
import time

# NOTE(review): this rebinds the imported Datafile *class* to a shared
# instance; every later Datafile.* call in this module goes through this
# singleton — confirm the shadowing is intended.
Datafile = Datafile()

proxies = get_proxy()  # proxy list fetched once at import time

thread_count = 40  # number of crawl worker threads spawned by MasterThread


class MasterThread:
    def __init__(self):
        # Shared crawl statistics, read and written by the worker threads.
        self.count = {
            'count': 0,  # total URLs queued for crawling
            'failed_count': 0,  # failed crawl attempts
            'sucess_count': 0,  # successful crawls (key spelling kept as-is: callers use it)
            'start_time': time.asctime(),  # session start timestamp
            'end_time': 0,  # session end timestamp (0 until finished)
        }
Esempio n. 11
0
File: main.py Progetto: edg-l/libtw2
def do(filename):
    """Validate *filename* by opening it as a Datafile; print any DatafileError."""
    try:
        with Datafile(filename) as _df:
            pass
    except DatafileError as err:
        print("{}: {}".format(filename, err))