Example #1
class SpellChecker:
    def __init__(self):
        self.dictionary = BloomFilter()
        with open('/usr/share/dict/words', 'r') as word_list:
            for word in word_list:
                self.dictionary.add(word.strip())

    def valid(self, string):
        return self.dictionary.includes(string)
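
# A brief usage sketch (an addition, not part of the original example); it assumes the
# BloomFilter class used above is available in this scope:
checker = SpellChecker()
print(checker.valid('hello'))  # True for any word present in /usr/share/dict/words
print(checker.valid('helzo'))  # usually False; a Bloom filter may return false positives,
                               # but never false negatives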
def testDumpAndLoadBase64BloomFilter(self):
    bloom_filter = BloomFilter(self.BLOOM_CAPACITY, self.BLOOM_ERROR_RATE)
    for key in self.all_keys:
        bloom_filter.add(key)

    dump_str = bloom_filter.dump_to_base64_str(gzipped=True)
    bloom_filter2 = BloomFilter.load_from_base64_str(dump_str)

    self.assertEqual(bloom_filter, bloom_filter2)
    self.check_contains(bloom_filter2)

def testDumpGzippedAndLoadBloomFilter(self):
    bloom_filter = BloomFilter(self.BLOOM_CAPACITY, self.BLOOM_ERROR_RATE)
    for key in self.all_keys:
        bloom_filter.add(key)

    dump_bytes = bloom_filter.dump(gzipped=True)
    bloom_filter2 = BloomFilter.load(dump_bytes)

    self.assertEqual(bloom_filter, bloom_filter2)
    self.check_contains(bloom_filter2)
Example #4
class WordLookup:
    def __init__(self, file_name):
        self.bf = BloomFilter(10000000, 8)
        input_file = open(file_name, "r")
        for file_line in input_file:
            file_line = file_line.rstrip()
            self.bf.add(file_line)
        input_file.close()

    def is_qualified(self, string):
        str_len = len(string)
        if str_len != 6:
            return False
        for i in range(1, str_len - 1):
            first = string[:i]
            second = string[i:]
            if self.bf.lookup(first) and self.bf.lookup(second):
                # print(first + '+' + second + '=>' + string)
                return True
        return False
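
# A brief usage sketch (an addition; it assumes a word-list file with one word per line):
lookup = WordLookup('/usr/share/dict/words')
print(lookup.is_qualified('sunday'))  # True when both halves of some split, e.g. 'sun' + 'day',
                                      # are (probably) in the Bloom filter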
Example #5
def main():
    initial_page = "http://yue.ifeng.com"

    url_queue = queue.Queue()
    url_filter = BloomFilter()
    url_filter.add(initial_page)
    url_queue.put(initial_page)

    while True:
        urls = []
        current_url = url_queue.get()  # take the first element off the queue
        try:
            store(current_url)
            urls = extract_urls(current_url)  # extract the links found on the page
        except Exception as e:
            print("Error extract_urls")
            print(e)
        for next_url in urls:
            if url_filter.notcontains(next_url):
                url_filter.add(next_url)
                url_queue.put(next_url)
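
# The BloomFilter used above (with a notcontains() method) is not shown in this example.
# A minimal stand-in with the same interface could be built on the bloom-filter package;
# the capacity and error rate below are arbitrary assumptions:
from bloom_filter import BloomFilter as _PackagedBloomFilter

class BloomFilter:
    def __init__(self, max_elements=1000000, error_rate=0.001):
        self._bf = _PackagedBloomFilter(max_elements=max_elements, error_rate=error_rate)

    def add(self, url):
        self._bf.add(url)

    def notcontains(self, url):
        # True only if the URL has definitely not been added (Bloom filters have no false negatives)
        return url not in self._bf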
Example #6
class TxPool(BaseService):
    """
    The :class:`~trinity.tx_pool.pool.TxPool` class is responsible for holding and relaying
    transactions, represented as :class:`~eth.rlp.transactions.BaseTransaction`, among the
    connected peers.

      .. note::

        This is a minimal viable implementation that only relays transactions but doesn't actually
        hold on to them yet. It is still missing many features of a grown-up transaction pool.
    """

    def __init__(self,
                 event_bus: EndpointAPI,
                 peer_pool: ETHProxyPeerPool,
                 tx_validation_fn: Callable[[BaseTransactionFields], bool],
                 token: CancelToken = None) -> None:
        super().__init__(token)
        self._event_bus = event_bus
        self._peer_pool = peer_pool

        if tx_validation_fn is None:
            raise ValueError('Must pass a tx validation function')

        self.tx_validation_fn = tx_validation_fn
        # 1m should give us 9000 blocks before that filter becomes less reliable
        # It should take up about 1mb of memory
        self._bloom = BloomFilter(max_elements=1000000)
        self._bloom_salt = str(uuid.uuid4())

    # This is a rather arbitrary value, but when the sync is operating normally we never see
    # the msg queue grow past a few hundred items, so this should be a reasonable limit for
    # now.
    msg_queue_maxsize: int = 2000

    async def _run(self) -> None:
        self.logger.info("Running Tx Pool")

        async for event in self.wait_iter(self._event_bus.stream(TransactionsEvent)):
            txs = cast(List[BaseTransactionFields], event.msg)
            await self._handle_tx(event.remote, txs)

    async def _handle_tx(self, sender: NodeAPI, txs: List[BaseTransactionFields]) -> None:

        self.logger.debug('Received %d transactions from %s', len(txs), sender)

        self._add_txs_to_bloom(sender, txs)

        for receiving_peer in await self._peer_pool.get_peers():

            if receiving_peer.remote is sender:
                continue

            filtered_tx = self._filter_tx_for_peer(receiving_peer, txs)
            if len(filtered_tx) == 0:
                continue

            self.logger.debug2(
                'Sending %d transactions to %s',
                len(filtered_tx),
                receiving_peer,
            )
            receiving_peer.sub_proto.send_transactions(filtered_tx)
            self._add_txs_to_bloom(receiving_peer.remote, filtered_tx)

    def _filter_tx_for_peer(
            self,
            peer: ETHProxyPeer,
            txs: List[BaseTransactionFields]) -> List[BaseTransactionFields]:

        return [
            val for val in txs
            if self._construct_bloom_entry(peer.remote, val) not in self._bloom
            # TODO: we need to keep track of invalid txs and eventually blacklist nodes
            if self.tx_validation_fn(val)
        ]

    def _construct_bloom_entry(self, remote: NodeAPI, tx: BaseTransactionFields) -> bytes:
        return f"{repr(remote)}-{tx.hash}-{self._bloom_salt}".encode()

    def _add_txs_to_bloom(self, remote: NodeAPI, txs: Iterable[BaseTransactionFields]) -> None:
        for val in txs:
            self._bloom.add(self._construct_bloom_entry(remote, val))

    async def do_cleanup(self) -> None:
        self.logger.info("Stopping Tx Pool...")
Example #7
class TxPool(BaseService, PeerPoolSubscriber):
    """
    The :class:`~trinity.tx_pool.pool.TxPool` class is responsible for holding and relaying
    transactions, represented as :class:`~evm.rlp.transactions.BaseTransaction`, among the
    connected peers.

      .. note::

        This is a minimal viable implementation that only relays transactions but doesn't actually
        hold on to them yet. It is still missing many features of a grown-up transaction pool.
    """
    logger = logging.getLogger("trinity.tx_pool.TxPool")

    def __init__(self, peer_pool: PeerPool) -> None:
        super().__init__()
        self._peer_pool = peer_pool
        # 1m should give us 9000 blocks before that filter becomes less reliable
        # It should take up about 1mb of memory
        self._bloom = BloomFilter(max_elements=1000000)
        self._bloom_salt = str(uuid.uuid4())

    def register_peer(self, peer: BasePeer) -> None:
        pass

    async def _run(self) -> None:
        self.logger.info("Running Tx Pool")

        with self.subscribe(self._peer_pool):
            while True:
                peer: ETHPeer
                peer, cmd, msg = await self.wait(self.msg_queue.get(),
                                                 token=self.cancel_token)

                if isinstance(cmd, Transactions):
                    await self._handle_tx(peer, msg)

    async def _handle_tx(self, peer: ETHPeer,
                         txs: List[BaseTransactionFields]) -> None:

        self.logger.debug('Received transactions from %r: %r', peer, txs)

        self._add_txs_to_bloom(peer, txs)

        for receiving_peer in self._peer_pool.peers:
            receiving_peer = cast(ETHPeer, receiving_peer)

            if receiving_peer is peer:
                continue

            filtered_tx = self._filter_tx_for_peer(receiving_peer, txs)
            if len(filtered_tx) == 0:
                continue

            self.logger.debug('Sending transactions to %r: %r', receiving_peer,
                              filtered_tx)
            receiving_peer.sub_proto.send_transactions(filtered_tx)
            self._add_txs_to_bloom(receiving_peer, filtered_tx)

    def _filter_tx_for_peer(
            self, peer: BasePeer,
            txs: List[BaseTransactionFields]) -> List[BaseTransactionFields]:

        return [
            val for val in txs
            if self._construct_bloom_entry(peer, val) not in self._bloom
        ]

    def _construct_bloom_entry(self, peer: BasePeer,
                               tx: BaseTransactionFields) -> bytes:
        return "{!r}-{}-{}".format(peer.remote, tx.hash,
                                   self._bloom_salt).encode()

    def _add_txs_to_bloom(self, peer: BasePeer,
                          txs: Iterable[BaseTransactionFields]) -> None:
        for val in txs:
            self._bloom.add(self._construct_bloom_entry(peer, val))

    async def _cleanup(self) -> None:
        self.logger.info("Stopping Tx Pool...")
Example #8
from bloom_filter import BloomFilter

tree = BloomFilter(max_elements=2**16, error_rate=0.1)
tree.add("nihao")
tree.add("sleep")
print(tree)
btree = {}
btree["a.fn"] = tree
print(btree)
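
# Membership checks on the filter above (a small added sketch); lookups use the `in` operator.
print("nihao" in tree)  # True: it was added above
print("hello" in tree)  # False, barring a false positive (error_rate=0.1 applies near full capacity)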
Example #9
def main():
    parser = argparse.ArgumentParser(description='Markov modeling and generation of DNS names')
    parser.add_argument('-n', metavar='COUNT', type=int, help='Number of names to generate')
    parser.add_argument('INPUT_FILE', help='input list of observed DNS names')
    args = parser.parse_args()

    count = args.n
    input_file_name = args.INPUT_FILE
    observed = BloomFilter(count * 50, 0.0001)

    suffix_map = {}
    suffix_freq = {}
    suffix_models = {}

    def preprocess(dns_name):
        return dns_name.strip().lower()

    # TODO: Should we add an epsilon to this to try and generate new suffix?
    def generate_val(dict_freq):
        rnd = random.random()
        gen_total = 0.0
        for k, v in dict_freq.items():
            gen_total = gen_total + v
            if rnd < gen_total:
                return k
        assert False
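
    # Note (added): generate_val draws a key with probability proportional to its value in
    # dict_freq; with normalized frequencies this is equivalent to
    # random.choices(list(dict_freq), weights=list(dict_freq.values()))[0].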

    with open(input_file_name, 'r') as input_file:
        dns_names = list(map(preprocess, input_file.readlines()))
        num_names = len(dns_names)

        for name in dns_names:
            parts = name.split('.')[1:]
            suffix = '.'.join(parts)
            t = tldextract.extract(suffix)
            if t.domain == '':
                num_names -= 1
                continue
            if suffix not in suffix_map:
                suffix_map[suffix] = [name]
                suffix_freq[suffix] = 1
            else:
                suffix_freq[suffix] += 1
                suffix_map[suffix].append(name)
            observed.add(name)

        for suffix in suffix_freq:
            suffix_freq[suffix] /= num_names

    while count > 0:
        suffix = generate_val(suffix_freq)
        if suffix not in suffix_models:
            if len(suffix_map[suffix]) > 1:
                suffix_models[suffix] = MarkovChain()
                names = map(lambda x: x.replace('.' + suffix, ''), suffix_map[suffix])
                suffix_models[suffix].train(list(names))
            else:
                continue
        name = suffix_models[suffix].generate_name() + '.' + suffix
        if name not in observed:
            observed.add(name)
            count -= 1
            print(name)
def main(args):
    """
    Parses command line arguments, and does the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """

    options = parse_args(args)  # This holds the nicely-parsed options object

    if options.k == 0:
        sys.stderr.write("Cannot use empty k-mers\n")
        sys.exit(1)

    if options.n == 0:
        sys.stderr.write("Cannot use no k-mers\n")
        sys.exit(1)

    sys.stderr.write('Load FASTA...\n')

    # We access the same strings so many times that if we index here we use many times the genome's size in memory.
    # There may be non-cached string views involved.
    # Just load the whole FASTA.
    index = SeqIO.to_dict(SeqIO.parse(options.fasta, "fasta"))

    sys.stderr.write('Analyze sequence sizes...\n')

    # Compute lengths of all sequences, for sampling
    sequence_names = list(index.keys())
    sequence_lengths = [len(index[name]) for name in sequence_names]

    # Weight sequences by how many full-length kmers fit in them
    sequence_weights = [max(l - (options.k - 1), 0) for l in sequence_lengths]
    # And get ready to look up how many start positions we have available in each sequence
    available_starts_by_name = dict(zip(sequence_names, sequence_weights))

    if sum(sequence_weights) == 0:
        # We can't sample anything actually
        sys.stderr.write("No long enough sequences in file\n")
        sys.exit(1)

    sys.stderr.write('Sample {}-mers...\n'.format(options.k))

    # Make the bloom filter for kmers and RCs
    acceptable = BloomFilter(max_elements=options.n,
                             error_rate=options.bloom_error)

    with enlighten.Counter(total=options.n,
                           desc='Sample',
                           unit='{}-mers'.format(options.k)) as bar:

        # This will be all the k-mers we want to count.
        # We use a counter because if we sample the same k-mer multiple times at the
        # sampling stage we want to count it multiple times for mappability
        # assessment.
        kmers = collections.Counter()
        kmers_sampled = 0
        while kmers_sampled < options.n:
            # Sample a k-mer
            sequence_name = random.choices(sequence_names, sequence_weights)[0]
            # randrange excludes the endpoint, so only valid start positions are drawn
            kmer_start = random.randrange(
                available_starts_by_name[sequence_name])
            kmer = index[sequence_name].seq[kmer_start:kmer_start + options.k]

            # Convert to upper case
            kmer = kmer.upper()

            if not all_ACGT(kmer):
                # Reject this one for having unacceptable letters
                continue

            # All k-mers shall be forward strand

            # Note that we are looking for this k-mer
            kmers[kmer] += 1
            # And that we sampled one.
            kmers_sampled += 1

            # Record it and its RC in the Bloom filter
            acceptable.add(kmer)
            acceptable.add(kmer.reverse_complement())

            bar.update()

    sys.stderr.write('Count {}-mers...\n'.format(options.k))

    # Now traverse the whole FASTA and count
    counts = collections.Counter()

    with enlighten.Counter(total=sum(sequence_weights),
                           desc='Count',
                           unit='{}-mers'.format(options.k)) as bar:

        # We will do the counting in processes
        processes = multiprocessing.pool.Pool(options.thread_count)

        # We put the AsyncResults in this queue and handle them as they become ready.
        # A straggler could make it get a bit big.
        result_queue = collections.deque()

        def handle_result(result):
            """
            Process the return value from a counting job. Runs in main thread.
            """

            # See if something is done
            reply_counts, reply_kmers_processed = result
            # If so, handle it
            for kmer, count in reply_counts.items():
                if kmer in kmers:
                    # Not a bloom filter false positive
                    counts[kmer] += count
                # Also see if the reverse complement is there.
                # If we sampled a kmer and its RC then seeing one should count for both.
                # If we sampled a palindrome we should count it twice every time we see it on the forward strand.
                rc = kmer.reverse_complement()
                if rc in kmers:
                    counts[rc] += count
            bar.update(reply_kmers_processed)

        for sequence in index.values():
            # Pre upper case everything
            sequence = sequence.upper()

            # Where is the past-end for the k-mer start positions?
            start_past_end = max(len(sequence) - (options.k - 1), 0)

            # Where will the next batch start in this sequence
            cursor = 0

            while cursor < start_past_end:
                # Work out how big a batch to make
                this_batch_size = min(options.batch_size,
                                      start_past_end - cursor)

                # Find the piece of sequence to process.
                # Make sure to provide the tail end where k-mer starts aren't.
                part = sequence.seq[cursor:cursor + this_batch_size +
                                    (options.k - 1)]

                async_result = processes.apply_async(
                    count_kmers, (options.k, acceptable, part))

                result_queue.append(async_result)

                cursor += this_batch_size

                while len(result_queue) > 0 and result_queue[0].ready():
                    # Pop off tasks that are done
                    handle_result(result_queue[0].get())
                    result_queue.popleft()

                while len(result_queue) > options.thread_count * 4:
                    # Too many things waiting. Wait and pick some up before continuing.
                    handle_result(result_queue[0].get())
                    result_queue.popleft()

        while len(result_queue) > 0:
            # Collect all the jobs left at the end
            handle_result(result_queue[0].get())
            result_queue.popleft()

    # Bucket k-mers by multiplicity in the genome.
    # We know they all appear at least once.
    kmers_by_multiplicity = collections.defaultdict(list)
    for kmer, multiplicity in counts.items():
        kmers_by_multiplicity[multiplicity].append(kmer)

    # Count up the total number of kmers with each multiplicity, properly
    # weighting multiple sampling
    kmers_with_multiplicity = {
        m: sum((kmers[k] for k in ks))
        for m, ks in kmers_by_multiplicity.items()
    }

    # Compute mappability
    effective_mapped = sum(
        (count / m for m, count in kmers_with_multiplicity.items()))
    possible_mapped = sum(kmers.values())

    print(
        "Expect to map {:.2f} {}-mers out of {} total, for mappability of {:.2f}%"
        .format(effective_mapped, options.k, options.n,
                effective_mapped / possible_mapped * 100))

    for m in sorted(kmers_with_multiplicity.keys()):
        print("{} copies: \t{} sampled {}-mers".format(
            m, kmers_with_multiplicity[m], options.k))

    return 0
Example #11
for fileid in movie_reviews.fileids('neg'):
  neg_reviews.extend(movie_reviews.words(fileid))
for fileid in movie_reviews.fileids('pos'):
  pos_reviews.extend(movie_reviews.words(fileid))

"""### Your task

In this Colab, you will develop a very simplistic spell-checker.  By no means should you think of using it for a real-world use case, but it is an interesting exercise to highlight the strengths and weaknesses of Bloom filters!
"""

from bloom_filter import BloomFilter

word_filter = BloomFilter(max_elements=236736)

for word in word_list:
  word_filter.add(word)

word_set = set(word_list)

"""If you executed the cell above, you now have 3 different variables in your scope:

1.   ```word_list```, a Python list containing the English dictionary (in case insensitive order)
2.   ```word_filter```, a Bloom filter where we have already added all the words in the English dictionary
3.   ```word_set```, a [Python set](https://docs.python.org/3.6/library/stdtypes.html#set-types-set-frozenset) built from the same list of words in the English dictionary

Let's inspect the size of each datastructure using the [getsizeof()](https://docs.python.org/3/library/sys.html#sys.getsizeof) method!
"""

from sys import getsizeof

print(f'Size of word_list (in bytes): {getsizeof(word_list)}')
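
# For comparison (an added sketch following the pattern above); note that getsizeof() reports
# only the container object's own footprint, not the memory referenced by its contents.
print(f'Size of word_filter (in bytes): {getsizeof(word_filter)}')
print(f'Size of word_set (in bytes): {getsizeof(word_set)}')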
class Partition:

    partitions = {}
    created_dirs = set()

    def __init__(self, partition_id, src_dir, dest_dir, dry_run, flatten):
        self.partition_id = partition_id
        self.src_dir = src_dir
        self.dest_dir = dest_dir
        self.dest_bloom = BloomFilter(max_elements=EST_MAX_FILES_PER_YEAR)
        self.dry_run = dry_run
        self.flatten = flatten

    def _dest_path(self, file_name):
        """
        Given a source path, determine a safe final destination path, disallowing any overwrites
        within a single job run.
        """
        if self.flatten:
            base = os.path.basename(file_name)
            (x, y) = os.path.splitext(base)
            # linear probing until we find an unused destination name
            tmp_dest = base
            i = 0
            while tmp_dest in self.dest_bloom:
                i += 1
                tmp_dest = ''.join([x, '-%d' % i, y])

            self.dest_bloom.add(tmp_dest)
            return os.path.join(self.dest_dir, str(self.partition_id),
                                tmp_dest)

        else:
            common_prefix = os.path.commonprefix([self.src_dir, file_name])
            path_suffix = file_name[len(common_prefix) + 1:]
            return os.path.join(self.dest_dir, str(self.partition_id),
                                path_suffix)

    def _ingest(self, file_name):

        dest_file_name = self._dest_path(file_name)

        if not self.dry_run:
            dest_dir = os.path.dirname(dest_file_name)

            if dest_dir not in Partition.created_dirs:
                Partition.created_dirs.add(dest_dir)
                if not os.path.isdir(dest_dir):
                    os.makedirs(dest_dir)

            shutil.copy2(file_name, dest_file_name)

        CMD_LOG.info('Partition %s\tcp %s %s' %
                     (self.partition_id, file_name, dest_file_name))

    @staticmethod
    def _get_partition(file_name, run_stats):
        exif = _read_exif_hachoir(file_name)
        if 'creation_date' in exif:
            p = _parse_exif_year(exif['creation_date'])
            if p:
                run_stats.count_partition_method('exif')
                return p

        path_year = _parse_filename_year(file_name)
        if path_year:
            run_stats.count_partition_method('path')
            return path_year

        run_stats.count_partition_method('unknown')
        return UNKNOWN_PARTITION

    @staticmethod
    def handle_file(file_name, src_dir, dest_dir, dry_run, flatten, run_stats):

        part = Partition._get_partition(file_name, run_stats)

        # if first time, do partition set-up
        if part not in Partition.partitions:
            Partition.partitions[part] = Partition(part, src_dir, dest_dir,
                                                   dry_run, flatten)

        Partition.partitions[part]._ingest(file_name)

        base, ext = os.path.splitext(file_name)
        run_stats.count_type(ext)
        run_stats.count_partition(part)
Example #13
class Crawler:
    def __init__(self, start_urls: List[str], crawled_pages_count: int,
                 chunk_size: int, fetch_workers: int, database_workers: int):
        # A plain set would eventually hit the memory limit on large crawls, so we
        # use a Bloom filter instead, trading away some lookup speed.
        self._visited = BloomFilter(max_elements=crawled_pages_count)
        self._logger = get_logger(__name__)
        self._stop_crawling = False
        self._urls = start_urls
        self._data = []
        self._buffer = []
        self._total_crawled_pages = 0
        self._fetch_error_rate = 0.9
        self._fetch_error_rate = 0.9
        self._crawled_pages_count = crawled_pages_count
        self._chunk_size = chunk_size
        self._fetch_workers = fetch_workers
        self._database_workers = database_workers
        self._max_buffer_len = self._chunk_size * self._fetch_error_rate

    def _get_urls(self) -> Generator:
        urls = self._urls
        self._urls = []
        for chunk in chunks_by_size(urls, self._chunk_size):
            yield chunk

    def _get_data(self, urls: List[str]):
        with ThreadPoolExecutor(self._fetch_workers) as executor:
            self._data = executor.map(fetch, urls)

    def _process_data(self):
        for status, url, clean_text, parsed_urls in self._data:
            if status != 0:
                continue
            self._visited.add(url)
            self._urls.extend(
                [u for u in parsed_urls if u not in self._visited])
            self._buffer.append((url, clean_text))

    def _save_data(self):
        self._total_crawled_pages += len(self._buffer)
        with ThreadPoolExecutor(self._database_workers) as executor:
            executor.map(bulk_insert,
                         chunks_by_count(self._buffer, self._database_workers))
        self._buffer = []

    def run(self):
        while True:
            if not self._urls or self._stop_crawling:
                self._logger.info('Total pages parsed: ' +
                                  str(self._total_crawled_pages))
                break

            for urls in self._get_urls():
                self._get_data(urls)
                self._process_data()
                if len(self._buffer) >= self._max_buffer_len:
                    self._save_data()

                if self._total_crawled_pages >= self._crawled_pages_count:
                    self._stop_crawling = True
                    break
     info, repl, last_info = log_parser(single_log, last_info)
     date = str(info[0]).split()[0].replace("-", "_")
     create_new_table(date, cursor)
     query_infos = []
     reply_infos = []
     while single_log:
         info, repl, last_info = log_parser(single_log, last_info)
         if info:
             query_infos.append(info)
         if repl:
             reply_infos.append(repl)
         single_log = f.readline()
         count += 1
         if count % 100000 == 0:
             query_info_sql(query_infos, cursor)
             reply_info_kfk(reply_infos)
             connection.commit()
             print(count)
             query_infos = []
     connection.commit()
 if enable_bloom_filter:
     tobe_check = []
     for domain in get_distinct_fld(date, cursor):
         domain = domain[0]
         if domain not in domain_filter:
             print(domain)
             domain_filter.add(domain)
             tobe_check.append(domain)
     with open("拉清单.txt", "a") as f:
         for i in tobe_check:
             f.write(i + "\n")
Example #15
# import BF library
from bloom_filter import BloomFilter
from collections import defaultdict

d = defaultdict(int)
Inputs = ["1", "2", "3", "2", "2", "2", "2", "3", "3", "3"]

myBF = BloomFilter(max_elements=10, error_rate=0.001)
myBF2 = BloomFilter(max_elements=10, error_rate=0.01)
myBF3 = BloomFilter(max_elements=10, error_rate=0.01)

for x in Inputs:
    print(f'x {x}')
    if x not in myBF:
        print(f'adding bf1 {x}')
        myBF.add(x)   # first sighting: record it in the first filter
    elif x not in myBF2:
        print(f'adding bf2 {x}')
        myBF2.add(x)  # second sighting: record it in the second filter
    elif x not in myBF3:
        print(f'adding bf3 {x}')
        myBF3.add(x)  # third sighting: record it in the third filter
    else:
        d[x] += 1     # present in all three filters: count the 4th and later sightings

print(d.keys())
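
# Expected output for the input above, barring false positives: dict_keys(['2', '3']).
# A key only reaches d on its fourth and later occurrences, once it is present in all three filters.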
Example #16
class JiangxiSpider(scrapy.Spider):
    # key startup parameters
    name = 'jiangxi_spider'
    allowed_domains = ['jxsggzy.cn']

    # initialization
    def __init__(self, *args, **kwargs):
        # root URL of the site to crawl
        self.base_url = 'http://jxsggzy.cn/web/'
        # super(QhSpider, self).__init__(*args, **kwargs)
        self.bloom_filter = BloomFilter(max_elements=1000000,
                                        error_rate=0.1,
                                        filename='bf.data')
        self.num = 0

        self.scrawl_mode = ScrawlMode.HISTORY

        self._stop_parse = False

    # main entry point
    def start_requests(self):
        """
        Default scrapy entry point; starts the crawl.
        :return:
        """
        # parameters passed in when the crawl was started
        # command example:
        # py -3 -m scrapy crawl jiangxi_spider -a start_time="2019:01:01" -a end_time="2019:01:02"
        # assert self.start_time is not None
        # assert self.end_time is not None
        # self.scrawl_mode = ScrawlMode.REAL_TIME if str(self.start_time).lower() == 'now' else ScrawlMode.HISTORY
        #
        # if self.scrawl_mode == ScrawlMode.HISTORY:
        #     if (len(self.start_time) != 10 or len(self.end_time) != 10
        #             or self.start_time[4] != ':' or self.end_time[4] != ':'):
        #         logging.error('Bad date format. Example: 2019:01:01')
        #         return
        # else:
        #     # use today's date
        #     _dt = datetime.fromtimestamp(time.time())
        #     self.start_time = _dt.strftime("%Y:%m:%d")
        #     self.end_time = self.start_time
        #
        info_type = {
            "01": {
                "name": "房屋及市政工程",
                "type": [1, 2, 3, 4]
            },
            "02": {
                "name": "交通工程",
                "type": [2, 3, 5]
            },
            "03": {
                "name": "水利工程",
                "type": [1, 2, 3, 4, 5]
            },
            "05": {
                "name": "重点工程",
                "type": [1, 2, 3, 4]
            },
            "06": {
                "name": "政府采购",
                "type": [1, 2, 3, 4, 5, 6]
            },
            "07": {
                "name": "国土资源交易",
                "type": [1, 2]
            },
            "08": {
                "name": "产权交易",
                "type": [3, 1, 2]
            },
            "09": {
                "name": "林权交易",
                "type": [1, 2]
            },
            "10": {
                "name": "医药采购",
                "type": [1, 2]
            },
            "13": {
                "name": "其他项目",
                "type": [1, 2]
            }
        }
        for _info_item in list(info_type.keys()):
            for _index, _info_item_num in enumerate(
                    info_type[_info_item]["type"]):
                _change_url = "http://jxsggzy.cn/web/jyxx/0020{}/0020{}00{}".format(
                    _info_item, _info_item, _info_item_num)

                _page_url = "http://jxsggzy.cn/web/jyxx/0020{}/0020{}00{}/1.html".format(
                    _info_item, _info_item, _info_item_num)
                _page_meta = {
                    "_info_item": _info_item,
                    "_info_item_num": _info_item_num,
                    "_index": _index,
                    "_change_url": _change_url,
                }
                time.sleep(1)
                yield scrapy.Request(url=_page_url,
                                     callback=self.parse_init,
                                     meta={'_page_meta': _page_meta})

    def parse_init(self, response):
        """
        :param response:
        :return:
        """
        self._stop_parse = False
        _total_num = response.xpath(
            '//span[@id="index"]/text()').extract_first()
        _total_num = _total_num.split('/')[1]
        if int(_total_num) > 0:
            try:
                for _page_num_item in range(int(_total_num)):
                    _page_init_detail_url = response.meta["_page_meta"][
                        "_change_url"]
                    _page_init_detail_url = _page_init_detail_url + "/{}.html".format(
                        _page_num_item + 1)
                    response.meta["_page_meta"][
                        "_page_init_detail_url"] = _page_init_detail_url
                    # if self._stop_parse:
                    #     break
                    time.sleep(1)
                    yield scrapy.Request(
                        url=_page_init_detail_url,
                        callback=self.parse_detail,
                        meta={'_page_meta': response.meta["_page_meta"]})
            except Exception:
                logging.exception('_total_num parsing failed for {}'.format(
                    response.url))

    def parse_detail(self, response):
        # print(1)
        _info_type_detail = {
            "01": {
                "name": "房屋及市政工程",
                "type": ["招标公告", "答疑澄清", "文件下载", "中标公告"]
            },
            "02": {
                "name": "交通工程",
                "type": ["招标公告", "补疑书", "中标公告"]
            },
            "03": {
                "name": "水利工程",
                "type": ["资格预审公告/招标公告", "澄清补遗", "文件下载", "中标候选人公示", "中标结果公示"]
            },
            "05": {
                "name": "重点工程",
                "type": ["招标公告", "答疑澄清", "文件下载", "结果公示"]
            },
            "06": {
                "name": "政府采购",
                "type": ["采购公告", "变更公告", "答疑澄清", "结果公示", "单一来源公告", "合同公示"]
            },
            "07": {
                "name": "国土资源交易",
                "type": ["交易公告", "成交公告"]
            },
            "08": {
                "name": "产权交易",
                "type": ["信息披露", "交易公告", "成交公告"]
            },
            "09": {
                "name": "林权交易",
                "type": ["信息披露", "成交公示"]
            },
            "10": {
                "name": "医药采购",
                "type": ["采购公告", "结果公示"]
            },
            "13": {
                "name": "其他项目",
                "type": ["交易公告", "成交公示"]
            }
        }

        item = JiangXiItem()

        for selector in response.xpath('.//div[@class="ewb-infolist"]/ul/li'):
            time.sleep(random.randint(100, 200) / 1000.0)  # 100 - 200 ms
            # URL of the announcement
            _content_url = selector.xpath('.//a/@href').extract_first()
            _detail_page_url = response.urljoin(_content_url)
            item['url'] = _detail_page_url

            # unique identifier
            _unq_id = CcgpUtil.get_unique_id(_detail_page_url)
            item['_id'] = _unq_id

            # skip items we have already seen
            if _unq_id in self.bloom_filter:
                continue

            self.bloom_filter.add(_unq_id)

            # region of the announcement
            item['area'] = "江西"

            # announcement category; the metadata passed via meta is read first because
            # both bid_type and notice_type below depend on it
            _index_detail = response.meta["_page_meta"]["_index"]
            _info_item_detail = response.meta["_page_meta"]["_info_item"]

            item['bid_type'] = _info_type_detail[_info_item_detail]["name"]
            print(_detail_page_url)
            # specific district of the announcement
            # item['area_detail'] = self.__get_area_detail__(selector, _detail_page_url)

            # tenderee (buyer)
            item['buyer'] = " "

            # announcement type
            item['notice_type'] = _info_type_detail[_info_item_detail]["type"][
                _index_detail]

            # source
            item['source'] = "jx"

            # site
            item['site'] = "jx"

            # announcement publication time
            item['notice_time'] = self.__get_notice_time__(
                selector, _detail_page_url)
            # if self.start_time or self.end_time:
            #     try:
            #         self.start_time = self.start_time.split(" ")[0].replace(":", ".")
            #         self.end_time = self.end_time.split(" ")[0].replace(":", ".")
            #
            #         if len(self.start_time) == 10:
            #             self.start_time = self.start_time + " 00:00:00"
            #
            #         if len(self.end_time) == 10:
            #             self.end_time = self.end_time + " 00:00:00"
            #
            #     except:
            #         logging.exception(
            #             'self.start_time {} or self.end_time failed {}'.format(self.start_time,
            #                                                                    self.end_time))
            # print(self.start_time, item['notice_time'])
            # if self.start_time > item['notice_time'] or self.end_time < item['notice_time']:
            #     self._stop_parse = True
            #     logging.info('time interval')
            #     return
            # else:
            #     self._stop_parse = False

            # announcement title
            item['title'] = self.__get_title__(selector, _detail_page_url)

            # announcement content
            item['content'] = self.__get_content__(selector, _detail_page_url)

            print(item)
            yield item

    @staticmethod
    def __get_area_detail__(selector, url):
        _ret = ''
        _area_detail = ["上绕市", "银川市", "石嘴山市", "吴忠市", "固原市", "中卫市"]

        try:
            _content_text = selector.xpath(
                'string(./div[@class="ewb-info-a"]/a)').extract()[0]
            _content_text = ''.join(_content_text.split())
            for _item in _area_detail:
                if _item in _content_text:
                    _ret = _item
                    break
        except:
            logging.exception('{} get_area_detail__ failed'.format(url))

        return _ret

    @staticmethod
    def __get_notice_time__(selector, url):
        _ret = ''
        try:
            _bid_info = selector.xpath(
                './/span[@class="ewb-list-date"]/text()').extract_first()
            if _bid_info:
                _ret = _bid_info.replace('-', '.') + " 00:00:00"
        except:
            logging.exception('{} get_notice_time failed'.format(url))

        return _ret

    @staticmethod
    def __get_title__(selector, url):
        _ret = ''
        try:
            _ret = selector.xpath(
                './a[@class="ewb-list-name"]/text()').extract_first().replace(
                    '\\n', '').rstrip().lstrip()
        except:
            logging.exception('{} get_title failed'.format(url))

        return _ret

    @staticmethod
    def __get_content__(selector, url):
        """
        Body (main text) content.
        If extracting the body fails, this crawl item is treated as failed, so errors
        must not simply be swallowed with a try/except here.
        :param selector:
        :param url:
        :return:
        """
        _bad = False
        _ret = ''
        try:
            _r = requests.get(url, timeout=15)
            _r.encoding = 'utf-8'
            _ret = base64.b64encode(zlib.compress(
                _r.text.encode('utf-8'))).decode('utf-8')
        except:
            _bad = True

        # if there was an error, retry once
        if _bad:
            time.sleep(1)
            _r = requests.get(url, timeout=15)
            _r.encoding = 'utf-8'
            _ret = base64.b64encode(zlib.compress(
                _r.text.encode('utf-8'))).decode('utf-8')

        return _ret
Example #17
    def get_urls_bloom_filter(urls):
        bloom_filter = BloomFilter(max_elements=10000, error_rate=0.001)
        for url in urls:
            bloom_filter.add(url)

        return bloom_filter
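
    # A brief usage sketch (added; it assumes this helper is callable in the enclosing scope):
    # seen = get_urls_bloom_filter(["https://example.com/a", "https://example.com/b"])
    # "https://example.com/a" in seen   # True
    # "https://example.com/c" in seen   # almost certainly False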
Example #18
class SpicSpider(scrapy.Spider):
    # key startup parameters
    name = 'spic_spider'
    allowed_domains = ['cpeinet.com.cn']

    # initialization
    def __init__(self, *args, **kwargs):
        # root URL of the site to crawl
        self.base_url = 'http://www.cpeinet.com.cn/'
        # super(QhSpider, self).__init__(*args, **kwargs)
        self.bloom_filter = BloomFilter(max_elements=1000000,
                                        error_rate=0.1,
                                        filename='bf.data')
        self.num = 0

        self.scrawl_mode = ScrawlMode.HISTORY

        self._stop_parse = False

    # main entry point
    def start_requests(self):
        """
        Default scrapy entry point; starts the crawl.
        :return:
        """
        # parameters passed in when the crawl was started
        # command example:
        # py -3 -m scrapy crawl ccgp_search -a start_time="2019:01:01" -a end_time="2019:01:02"
        # assert self.start_time is not None
        # assert self.end_time is not None
        # self.scrawl_mode = ScrawlMode.REAL_TIME if str(self.start_time).lower() == 'now' else ScrawlMode.HISTORY
        #
        # if self.scrawl_mode == ScrawlMode.HISTORY:
        #     if (len(self.start_time) != 10 or len(self.end_time) != 10
        #             or self.start_time[4] != ':' or self.end_time[4] != ':'):
        #         logging.error('Bad date format. Example: 2019:01:01')
        #         return
        # else:
        #     # use today's date
        #     _dt = datetime.fromtimestamp(time.time())
        #     self.start_time = _dt.strftime("%Y:%m:%d")
        #     self.end_time = self.start_time
        #
        info_type = [1, 2, 3, 7, 33, 4, 5]
        for _info_item in info_type:
            _page_url = "http://www.cpeinet.com.cn/cpcec/bul/bul_list.jsp?type={}".format(
                _info_item)
            _page_meta = {"_info_item": _info_item, "_change_url": _page_url}
            time.sleep(1)
            yield scrapy.Request(url=_page_url,
                                 callback=self.parse_init,
                                 meta={'_page_meta': _page_meta})

    def parse_init(self, response):
        self._stop_parse = False
        _total_num = response.xpath(
            './/div[@class="page"]/font/text()').extract()[0]
        print(_total_num)

        if int(_total_num) > 0:
            try:
                for _page_num_item in range(int(_total_num)):
                    _page_init_detail_url = response.meta["_page_meta"][
                        "_change_url"]
                    _page_init_detail_url = _page_init_detail_url + "/{}.html".format(
                        _page_num_item + 1)
                    response.meta["_page_meta"][
                        "_page_init_detail_url"] = _page_init_detail_url
                    # if self._stop_parse:
                    #     break
                    time.sleep(1)
                    yield scrapy.Request(
                        url=_page_init_detail_url,
                        callback=self.parse_detail,
                        meta={'_page_meta': response.meta["_page_meta"]})
            except Exception:
                logging.exception('_total_num parsing failed for {}'.format(
                    response.url))

    def parse_detail(self, response):
        item = SpicItem()
        for selector in response.xpath('.//div[@class="article_list_lb"]/li'):
            time.sleep(random.randint(100, 200) / 1000.0)  # 100 - 200 ms
            # URL of the announcement
            _content_url = selector.xpath('.//span/a/@href').extract_first()
            _detail_page_url = response.urljoin(_content_url)
            item['url'] = _detail_page_url

            # unique identifier
            _unq_id = CcgpUtil.get_unique_id(_detail_page_url)
            item['_id'] = _unq_id

            # skip items we have already seen
            if _unq_id in self.bloom_filter:
                continue

            self.bloom_filter.add(_unq_id)

            # region of the announcement
            item['area'] = "江西"

            print(_detail_page_url)
            # specific district of the announcement
            # item['area_detail'] = self.__get_area_detail__(selector, _detail_page_url)

            # tenderee (buyer)
            item['buyer'] = " "

            # announcement type
            _index_detail = response.meta["_page_meta"]["_index"]
            _info_item_detail = response.meta["_page_meta"]["_info_item"]
            item['notice_type'] = _info_type_detail[_info_item_detail]["type"][
                _index_detail]

            # source
            item['source'] = "jx"

            # site
            item['site'] = "jx"

            # announcement publication time
            item['notice_time'] = self.__get_notice_time__(
                selector, _detail_page_url)
            # announcement title
            item['title'] = self.__get_title__(selector, _detail_page_url)

            # announcement content
            item['content'] = self.__get_content__(selector, _detail_page_url)

            print(item)

        pass

    @staticmethod
    def __get_notice_time__(selector, url):
        _ret = ''
        try:
            _bid_info = selector['showdate']
            if _bid_info:
                _ret = _bid_info.replace('-', '.')
            if len(_bid_info) == 10:
                _bid_info = _bid_info + " 00:00:00"
            else:
                _ret = _ret.split(" ")[0] + " 00:00:00"
        except:
            logging.exception('{} get_notice_time failed'.format(url))

        return _ret

    @staticmethod
    def __get_title__(selector, url):
        _ret = ''
        try:
            _ret = selector['title']
        except:
            logging.exception('{} get_title failed'.format(url))

        return _ret

    @staticmethod
    def __get_content__(selector, url):
        """
        Body (main text) content.
        If extracting the body fails, this crawl item is treated as failed, so errors
        must not simply be swallowed with a try/except here.
        :param selector:
        :param url:
        :return:
        """
        _bad = False
        _ret = ''
        try:
            _r = requests.get(url, timeout=15)
            _r.encoding = 'utf-8'
            _ret = base64.b64encode(zlib.compress(
                _r.text.encode('utf-8'))).decode('utf-8')
        except:
            _bad = True

        # if there was an error, retry once
        if _bad:
            time.sleep(1)
            _r = requests.get(url, timeout=15)
            _r.encoding = 'utf-8'
            _ret = base64.b64encode(zlib.compress(
                _r.text.encode('utf-8'))).decode('utf-8')

        return _ret
Example #19
from bloom_filter import BloomFilter
# from pybloom import BloomFilter
fruit = BloomFilter(100000, error_rate=0.001, filename='/tmp/fruit.bloom')
# fruit = BloomFilter(100000, error_rate=0.001)
[fruit.add(x) for x in ['apple', 'pear', 'orange']]
print('aple' in fruit)
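# Note (added): 'aple' is misspelled and was never added, so this prints False unless the
# filter returns a rare false positive; 'apple' in fruit would print True.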
Example #20
class TopBuzzVideo():
    def __init__(self):
        self.headers = {
            'User-Agent':
            'Dalvik/2.1.0 (Linux; U; Android 8.0.0; MIX 2 MIUI/V10.2.2.0.ODECNXM) NewsArticle/8.4.4'
        }
        self.cookies = {
            'cookies':
            'install_id=6672646082571388678; ttreq=1$a9ed7f4ce8fc84fced473d6e25c22226f381c13d; odin_tt=3e76568447d177856560d524c6ef5400407a437cfdd62767a36fb3b2decdeb01d43b9a7978232dc05c57af3c81bd10c277e78619093795e8392c1302c9aa8a75; sid_guard=c8f84a23bcce86b376964aeb42991709%7C1554173959%7C5184000%7CSat%2C+01-Jun-2019+02%3A59%3A19+GMT; uid_tt=2ad7176029f7302e11b7924e6e6566b7120075732cedcd39bc999fa5cbcf07a1; sid_tt=c8f84a23bcce86b376964aeb42991709; sessionid=c8f84a23bcce86b376964aeb42991709'
        }
        self.headers_details = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
        }
        self.cookies_details = {
            'Cookie':
            'tt_webid=6683297640216282629; __tea_sdk__user_unique_id=6683297640216282629; __tea_sdk__ssid=40d2e59e-696c-4a93-ace8-e1479b10aeef; csrf-token=61575f8b568b577d9d06c777d103ae53e6c10723; csrf-secret=6qDUsFL6WZ1aG2soaPw7PpmCtnxCv7fw'
        }
        self.post_video_url = 'http://127.0.0.1:30008/crawler/video/transfer'
        self.filter_url = 'http://console.cc.clipclaps.tv/crawler/log'
        self.have_met = BloomFilter(max_elements=100000, error_rate=0.1)

    def run(self):
        number = 0
        while number < 5:
            t = time.time()
            result = re.findall('.\d*', str(t))  # regex-match the fractional digits of the timestamp
            sign = tb.hash_code(result[1][1:])  # hash the timestamp fragment to build the sign parameter
            timestamp = result[0]
            start_url = 'https://i16-tb.isnssdk.com/api/844/stream?session_impr_id=0&tab=General&count=20&min_behot_time=1.554174097999E9&loc_mode=7&lac=4314&cid=6439033' \
                  '&sign=' + sign + \
                  '&timestamp=' + timestamp + \
                  '&logo=topbuzz&gender=0&bv_is_auto_play=0&youtube=0&manifest_version_code=844&app_version=8.4.4&iid=6672646082571388678&gaid=54b268f4-52c2-470c-a815-abd1d00acce9&original_channel=gp&channel=gp&fp=TlTrJzK1FYsqFYs5PlU1LMGSL2Xr&device_type=MIX+2&language=en&app_version_minor=8.4.4.01&resolution=2030*1080&openudid=ab50caa43e995042&update_version_code=8440&sys_language=zh&sys_region=cn&os_api=26&tz_name=Asia%2FShanghai&tz_offset=28800&dpi=440&brand=Xiaomi&ac=WIFI&device_id=6672637176796333574&os=android&os_version=8.0.0&version_code=844&hevc_supported=1&device_brand=Xiaomi&device_platform=android&sim_region=cn&region=us&aid=1106&ui_language=en'
            tb.analysis_topBuzz(start_url=start_url)
            number += 1
            time.sleep(random.uniform(60, 70))  # wait roughly one minute between requests

    def hash_code(self, pwd):
        # build a sha1 hash object from the input
        h = hashlib.sha1()
        h.update(pwd.encode())
        # return the hex digest string
        return h.hexdigest()

    def analysis_topBuzz(self, start_url):
        try:
            res = requests.post(url=start_url,
                                headers=self.headers,
                                cookies=self.cookies).text
            time.sleep(random.uniform(1, 3))
            data = json.loads(res)
            item = data['data']['items']
            # parse the listing page and collect detail-page URLs
            for i in range(len(item)):
                cls = item[i]['article_class']
                if cls == 'Video':
                    duration = item[i]['video']['duration']
                    if duration < 360:
                        share_url = item[i]['share_url']
                        video_url = item[i]['video']['url_list'][0]['urls'][0]
                        data = self.parsing_details_url(details_url=share_url,
                                                        video_url=video_url)
                        print('analysis_topBuzz_data:\n', data)
                        self.save_video(data=data)
                    else:
                        pass
        except:
            pass

    def parsing_details_url(self, details_url=None, video_url=None):
        status = self.filter_data(details_url=details_url)
        if status:
            print('Data already exists!')
        else:
            time.sleep(random.uniform(0, 3))
            result = requests.get(url=details_url,
                                  headers=self.headers_details,
                                  cookies=self.cookies_details).text
            html = etree.HTML(result)
            # job id for this scheduling run
            jobId = time.time()
            # article title
            title = html.xpath('//div[@class="title"]/text()')[0]
            # author name
            authorName = ' '.join(
                html.xpath('//div[@class="name active"]/text()'))
            if authorName == '':
                authorName = ' '.join(
                    html.xpath('//div[@class="name"]/text()'))
            # article publish time
            releaseTime = ' '.join(
                html.xpath('//div[@class="publishTime"]/text()'))
            # download the video itself
            video = self.download_video(videoUrl=video_url)

            return {
                'jobId': jobId,
                'sourceUrl': details_url,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'video': video
            }

    def download_video(self, videoUrl):
        videoId = str(uuid.uuid4()).replace('-', '')
        downloadPath = '/data/crawler'
        videoPath = '/topbuzz/video/'
        urllib.request.urlretrieve(
            videoUrl, r'%s.mp4' % (downloadPath + videoPath + str(videoId)))
        video = '%s.mp4' % (videoPath + str(videoId))
        return video

    def filter_data(self, details_url):
        data1 = urllib.parse.urlencode({
            'type': int(4),
            'days': int(3),
        })
        data2 = data1.encode('utf-8')
        re = urllib.request.urlopen(url=self.filter_url, data=data2)
        status = re.read().decode('utf-8')
        result = json.loads(status)
        data = result['data']
        for kw in data:
            self.have_met.add(data[kw])
        if details_url in self.have_met:
            return True
        else:
            return False

    def save_video(self, data):
        data1 = urllib.parse.urlencode({
            'source': 1,
            'sourceUrl': data['sourceUrl'],
            'title': data['title'],
            'authorName': data['authorName'],
            'releaseTime': data['releaseTime'],
            'video': data['video'],
        })
        data2 = data1.encode('utf-8')
        re = urllib.request.urlopen(url=self.post_video_url, data=data2)
        status = re.read().decode('utf-8')
        print('status:\n', status)
Example #21
    return makeMessage(magic, b"filterload", inv)

def mempoolMessage():
    return makeMessage(magic, b"mempool", b"\x00\x00\x00\x00")

def getHeadersMessage(hash):
    version = 70015
    return makeMessage(magic, b"getheaders", struct.pack('<ih32s32s', version, 1, unhexlify(hash), b'\x00'))
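
# The optimal_km helper is not shown in this example. A common definition (an assumption,
# not necessarily the original) derives the filter size m in bits and the number of hash
# functions k from the expected element count n and the target false-positive rate p:
import math

def optimal_km(n, p):
    # m = -n * ln(p) / (ln 2)^2 bits, k = (m / n) * ln 2 hash functions
    m = math.ceil(-n * math.log(p) / (math.log(2) ** 2))
    k = max(1, round(m / n * math.log(2)))
    return k, m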

k, m = optimal_km(1, 0.001)

bfilter = BloomFilter(m, k)

data_to_hash = unhexlify("n4ewvXymapgcMARgjMNPvYy2BnCji95SMz")

bfilter.add(data_to_hash)


sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

sock.connect(("116.203.72.215", 18333))

sock.send(versionMessage())

a = sock.recv(24*8) # receive version
b = sock.recv(1000)

sock.send(verackMessage())

c = sock.recv(1000) # receive verack
Example #22
class CompanySpider(scrapy.Spider):
    name = "companySpider"

    def __init__(self, name=None, **kwargs):
        self.bloom = BloomFilter(1000000, 0.001)
        for url in dbComponent.get_all_company_url():
            self.bloom.add(url)
        print("Bloom filter initialized")
        super().__init__(name, **kwargs)

    def start_requests(self):
        # category
        file = open("category.txt", "r", encoding="utf-8")
        for url in file.readlines():
            url = url.replace("\n", "")
            yield scrapy.Request(url=url, callback=self.parse_list)
        print("start_requests 初始化完毕")

    # parse a listing page
    def parse_list(self, response):
        next_url = response.css(".PagedList-skipToNext a::attr(href)").extract_first()
        if (next_url is not None) and (next_url != ''):
            next_url = "https://vinabiz.org" + next_url
            yield scrapy.Request(url=next_url, callback=self.parse_list)
        url_list = response.css("h4 a::attr(href)").extract()
        for url in url_list:
            url = "https://vinabiz.org" + url
            if url not in self.bloom:
                yield scrapy.Request(url=url, callback=self.parse_company)

    # parse a company detail page
    def parse_company(self, response):
        td_list = response.css("#wid-detail-info td")
        index = 0
        data = {}
        data['guid'] = str(uuid.uuid4()).replace("-", "")
        data['url'] = response.url
        while index < td_list.__len__():
            td = td_list[index]
            if td.css("::attr(class)").extract_first() == "bg_table_td":
                index += 1
                content = self.process_content(td_list[index])
                title = td.css("::text").extract_first()
                # BUSINESS
                if title == 'Tên chính thức':
                    data['official_name'] = content
                if title == 'Tên giao dịch':
                    data['trading_name'] = content
                if title == 'Mã doanh nghiệp':
                    data['business_code'] = content
                if title == 'Ngày cấp':
                    data['date_range'] = content
                if title == 'Cơ quan thuế quản lý':
                    data['tax_authorities_manage'] = content
                if title == 'Ngày bắt đầu hoạt động':
                    data['date_of_commencement_of_operation'] = content
                if title == 'Trạng thái':
                    data['status'] = content
                # CONTACT
                if title == 'Địa chỉ trụ sở':
                    data['office_address'] = content
                if title == 'Điện thoại':
                    data['phone1'] = content
                if title == 'Fax':
                    data['fax'] = content
                if title == 'Email':
                    data['email'] = content
                if title == 'Website':
                    data['website'] = content
                if title == 'Người đại diện':
                    data['representative'] = content
                if title == 'Điện thoại':
                    data['phone2'] = content
                if title == 'Địa chỉ người đại diện':
                    data['representative_address'] = content
                if title == 'Giám đốc':
                    data['manager'] = content
                if title == 'Điện thoại giám đốc':
                    data['phone_director'] = content
                if title == 'Địa chỉ giám đốc':
                    data['address_director'] = content
                if title == 'Kế toán':
                    data['accountant'] = content
                if title == 'Điện thoại kế toán':
                    data['phone_accounting'] = content
                if title == 'Địa chỉ kế toán':
                    data['account_address'] = content
                # INDUSTRY
                if title == 'Ngành nghề chính':
                    data['main_job'] = content
                if title == 'Lĩnh vực kinh tế':
                    data['economic_field'] = content
                if title == 'Loại hình kinh tế':
                    data['type_of_economic'] = content
                if title == 'Loại hình tổ chức':
                    data['type_of_organization'] = content
                if title == 'Cấp chương':
                    data['class_chapters'] = content
                if title == 'Loại khoản':
                    data['item_type'] = content
            index += 1
        dbComponent.add_company(data)

    def process_content(self, td):
        # Join all text nodes of a value cell into a single cleaned string.
        result = ''
        try:
            text_list = td.css("::text").extract()
            for text in text_list:
                result += text
            result = result.replace("\n", "")
            result = result.replace("'", "\\'")
        except Exception as e:
            print(e)
        return result
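# Added illustration (not part of the original snippet): a standalone sketch of
# the label/value pairing used in parse_company above. Each cell with class
# "bg_table_td" carries a label and the cell that follows it carries the value.
# The HTML fragment below is made up purely for illustration.
from scrapy.selector import Selector

html = """
<table id="wid-detail-info"><tr>
  <td class="bg_table_td">Email</td><td>info@example.com</td>
  <td class="bg_table_td">Website</td><td>example.com</td>
</tr></table>
"""

sel = Selector(text=html)
cells = sel.css("#wid-detail-info td")
pairs = {}
i = 0
while i < len(cells):
    if cells[i].css("::attr(class)").extract_first() == "bg_table_td":
        label = cells[i].css("::text").extract_first()
        value = "".join(cells[i + 1].css("::text").extract()).strip()
        pairs[label] = value
        i += 1
    i += 1
print(pairs)   # {'Email': 'info@example.com', 'Website': 'example.com'}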
Beispiel #23
0
class WebCrawler:
    """ The WebCrawler class implements a multithreaded web crawler starting from the base_url.

        The crawler runs based on the breadth-first search algorithm and starts running from the
        base_url and parsing the content to get more urls to be crawled.

        :param  base_url:       the input of the starting url of the web crawling
        :param  closure_url:    the stopping condition is based on this parameter
        :param  pool:           the threadpool
        :param  task_queue:     the task_queue that include all the urls that need to be crawled
        :param  crawled_pages:  a bloom filter used to eliminated visited pages from the BFS algorithm
        :param  total:          counter for the total number of pages being crawled
        :param  lock:           mutex lock to prevent race condition when read/write crawled_pages
        :param  run_time:       timer for the time spent on running the web crawler
    """
    def __init__(self, base_url: str, cfg: configparser) -> None:
        self.base_url = base_url
        self.config = cfg
        self.closure_url = '{scheme}://{netloc}'.format(
            scheme=urlsplit(self.base_url).scheme,
            netloc=urlsplit(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(
            max_workers=int(self.config['MAX_WORKER']))
        self.task_queue = Queue(maxsize=3 * int(self.config['MAX_WORKER']))
        self.task_queue.put(self.base_url)
        self.crawled_pages = BloomFilter(
            max_elements=int(self.config['MAX_ELEMENTS']),
            error_rate=float(self.config['ERROR_RATE']))
        self.crawled_pages.add(self.base_url)
        self.total = 1
        self.lock = Lock()
        self.run_time = time.time()

    def _add_to_task_queue(self, child_url: str) -> None:
        """ Add a url to the task queue (thread-safe).

        If the child_url passes the bloom filter (i.e. it has not been seen before),
        it is added to the task queue.

        :param child_url:   the url to be added to the task queue
        :return: None
        """
        ret = self.lock.acquire(timeout=int(self.config['TIMEOUT']))
        if ret:
            if child_url not in self.crawled_pages:
                self.crawled_pages.add(child_url)
                self.total += 1
                self.lock.release()
                try:
                    self.task_queue.put(child_url,
                                        block=True,
                                        timeout=int(self.config['TIMEOUT']))
                except Full:
                    logger.error(
                        "Task queue full when putting {child_url}".format(
                            child_url=child_url))
                else:
                    logger.info("\t{child_url}".format(child_url=child_url))
            else:
                self.lock.release()
        else:
            logger.error("Lock timed out.")

    def _parse_html(self, html: str, parent_url: str) -> None:
        """ Parse the html content of the page from the parent_url.

            Get all the urls (must start with the closure_url as the stopping condition)
            from the html page and add them to the task queue.

            :param  html:       the html content of the parent_url
            :param  parent_url: the url of the html page to be parsed
            :return None
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        logger.info("{parent_url}".format(parent_url=parent_url))
        for link in set(links):
            child_url = link['href']
            if child_url.startswith('/') or child_url.startswith(
                    self.closure_url):
                child_url = urljoin(self.closure_url, child_url)
                if child_url.endswith('/'):
                    child_url = child_url[:-1]
                self._add_to_task_queue(child_url)

    def _callback(self, res: concurrent.futures.Future) -> None:
        """ Callback invoked once the html page has been downloaded.

            :param  res: the future wrapping the (response, url) pair returned by _get_page
            :return None
        """
        result, url = res.result()
        if not result:
            return
        if result.status_code == 200 and 'html' in result.headers[
                'content-type']:
            self._parse_html(result.text, url)
        elif result.status_code in (301, 302):
            redirect_url = result.headers['Location']
            if redirect_url.endswith('/'):
                redirect_url = redirect_url[:-1]
            logger.debug("{url} is redirected to {redirect_url}".format(
                url=url, redirect_url=redirect_url))
            self._add_to_task_queue(redirect_url)

    def _get_page(self, url: str) -> (requests, str):
        """ Get the page from the url

            :param  url: the input of requests
            :return (requests, url)
        """
        try:
            res = requests.get(url, timeout=int(self.config['TIMEOUT']))
            return res, url
        except requests.RequestException as e:
            logger.warning('{e} for {url}'.format(e=e, url=url))
            return None, url

    def run(self) -> None:
        """ Run the webcrawler in a parallel fashion. The workers managed by
            the thread pool get tasks from the shared task queue. Once the worker
            is fired, it will get the url page by calling _get_page. When the page
            is got, the _callback will be called to parse the page to get more urls
            to be added to the task queue.
        """
        while True:
            try:
                target_url = self.task_queue.get(
                    timeout=int(self.config['TIMEOUT']))
                job = self.pool.submit(self._get_page, target_url)
                job.add_done_callback(self._callback)
            except Empty:
                return
            except Exception as e:
                logger.warning(e)
                continue

    def report(self) -> None:
        """ Report the wall time and total pages on the web crawler
        """
        self.pool.shutdown(wait=True)
        self.run_time = time.time() - self.run_time
        logger.info(
            "{time:.2f} seconds is spent to crawl on {total} pages".format(
                time=self.run_time, total=self.total))
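
# A minimal driver sketch (not part of the original class). The config keys
# mirror the ones WebCrawler actually reads (MAX_WORKER, MAX_ELEMENTS,
# ERROR_RATE, TIMEOUT); any mapping with those keys works, since the class
# only performs cfg['KEY'] lookups.
if __name__ == '__main__':
    cfg = {
        'MAX_WORKER': '8',
        'MAX_ELEMENTS': '100000',
        'ERROR_RATE': '0.01',
        'TIMEOUT': '10',
    }
    crawler = WebCrawler('https://example.com', cfg)
    crawler.run()       # keeps pulling urls until the queue stays empty for TIMEOUT seconds
    crawler.report()    # waits for outstanding workers, then logs wall time and page count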
    seqs = seqs.read().split('\n')
    # k = 10
    # x = -1.2
    # print(sys.getsizeof(x))
    # print(seqs[0])
    # kmers = []
    # for seq in seqs:
    #     kmers.append(get_kmers(seq, k))
    # for km in kmers:
    #     print(Counter(km))
    # print(get_kmers(seqs[0], k))
    # print(sys.getsizeof(kmers))

    bloom = BloomFilter(max_elements=16000, error_rate=0.05)
    bloom2 = BloomFilter(max_elements=6719656, error_rate=0.05)
    bloom.add(seqs[10])
    print(sys.getsizeof(bloom))

    # Serialize the filter to disk with pickle ...
    with open('testfilter.bloom', 'wb') as testfilter:
        pickle.dump(bloom, testfilter)

    # ... then load it back; bloom2 is replaced by the deserialized filter
    with open('testfilter.bloom', 'rb') as testfilter:
        bloom2 = pickle.load(testfilter)
        print(bloom2)
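
    # Added round-trip check (not in the original snippet): membership survives
    # pickling, so the reloaded filter still recognises the sequence added above.
    print(seqs[10] in bloom)     # True
    print(seqs[10] in bloom2)    # True for the deserialized copy as well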
         for element in item['const_enum_arrays']
         if len(element['array']) >
         config.const_enum_array_length_threshold
         and len(item['const_enum_arrays']) > 0
     ]
 """build collection 'fn'"""
 if len(repo_features['func_names']) > 0:
     db_feature = mongoDB.mongodb_synic(
         host=config.db_host,
         port=config.db_port,
         db_feature=config.db_feature,
         collection_name=config.db_fn_collection)
     document = {}
     bf_fn = BloomFilter(max_elements=config.bloom_filter_volume,
                         error_rate=config.bloom_filter_error_rate)
     [bf_fn.add(element) for element in repo_features['func_names']]
     bf_fn_pickle = cPickle.dumps(bf_fn)
     document[file.split('.json')[0]] = bf_fn_pickle
     db_feature.do_add(document_item=document)
 """build collection '1g'"""
 if len(repo_features['strings']) > 0:
     db_feature = mongoDB.mongodb_synic(
         host=config.db_host,
         port=config.db_port,
         db_feature=config.db_feature,
         collection_name=config.db_1g_collection)
     document = {}
     bf_1g = BloomFilter(max_elements=config.bloom_filter_volume,
                         error_rate=config.bloom_filter_error_rate)
     [bf_1g.add(element) for element in repo_features['strings']]
     bf_1g_pickle = cPickle.dumps(bf_1g)
Beispiel #26
0
from Auth_app.models import User, FL_freq
from Auth_app.extract import extract_cf
from django.views.decorators.csrf import csrf_exempt
import json
import os
import time

from datetime import timedelta
from django.core.signing import TimestampSigner
from bloom_filter import BloomFilter  # the bloom filter stores the known characteristic frequencies
signer = TimestampSigner()

bloom = BloomFilter(max_elements=100000)
freq = range(15300, 15500)
for f in freq:
    bloom.add(f)
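
# Hypothetical usage sketch (not in the original snippet): a view could accept a
# measured characteristic frequency only if the Bloom filter has seen it. A Bloom
# filter never yields false negatives, but an unknown frequency may occasionally
# slip through as a false positive.
def is_known_frequency(cf):
    return cf in bloom

# is_known_frequency(15350)  -> True  (inside the 15300-15500 range added above)
# is_known_frequency(20000)  -> False (almost always; false positives are possible)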


# Create your views here.
def home(request):
    if request.GET:
        # import pdb;pdb.set_trace()
        # user_list = User.objects.all()
        # User.objects.all().delete()
        # import pdb;pdb.set_trace()

        # res = User.objects.get_or_create(mac = request.GET['mac'])
        # User.objects.filter(mac = request.GET['mac']).update(gw_id = request.GET['gw_id']\
        # 	,gw_address=request.GET['gw_address'],gw_port=request.GET['gw_port'],url=request.GET['url'])

        # res[0].gw_id = request.GET['gw_id']
Beispiel #27
0
    def merge(self, merge_table):

        new_file  = open("tmp_file", 'w+')
        new_index = open("tmp_index", 'w+') 
        new_bloom = BloomFilter()

        self.max = merge_table.max if merge_table.max > self.max else self.max
        self.min = merge_table.min if merge_table.min < self.min else self.min

        cursor = csv.writer(new_file)
        index_cursor = csv.writer(new_index)

        allset = set() 

        input_file = open(merge_table.data_name, 'r')
        input_label = input_file.readline()
        input_label_arr = input_label.rstrip().split(',')
        allset |= set(input_label_arr)
        input_file.close()

        merged_file = open(self.data_name, 'r')
        merged_label = merged_file.readline()
        merged_label_arr = merged_label.rstrip().split(',')
        allset |= set(merged_label_arr)
        merged_file.close()

        count = 0
        header = []
        for val in allset:
            self.dataid[val] = count
            header.append(val)
            count = count + 1

        cursor.writerow(header)

        with open(merge_table.index_name, 'r') as file:
            input_lines = [line.strip() for line in file]

        with open(self.index_name, 'r') as file:
            merged_lines = [line.strip() for line in file]

        i = 0
        j = 0
        self.size = 0
        while (i < len(input_lines) and j < len(merged_lines)):
            #index 0 is key, index 1 is offset
            input_line = input_lines[i].split(',')
            merged_line = merged_lines[j].split(',')

            #index 0 is key, index 1 is offset
            input_file = open(merge_table.data_name, 'r')
            input_label = input_file.readline()
            input_label_arr = input_label.rstrip().split(',')
            input_lst = [""] * len(allset)
            input_file.seek(int(input_line[1]))
            input_value = input_file.readline()

            # Avoid merging deleted files
            if ("," not in input_value):
                i = i + 1
                input_file.close()
                continue

            input_arr = input_value.rstrip().split(',')

            #index 0 is key, index 1 is offset
            merged_file = open(self.data_name, 'r')
            merged_label = merged_file.readline()
            merged_label_arr = merged_label.rstrip().split(',')
            merged_lst = [""] * len(allset)
            merged_file.seek(int(merged_line[1]))
            merged_value = merged_file.readline()

            # Avoid merging deleted files
            if ("," not in merged_value):
                j = j + 1
                merged_file.close()
                continue

            merged_arr = merged_value.rstrip().split(',')

           #print("shit:%s" %(merged_line[0]))
            
            if (input_line[0] <= merged_line[0]):
                for k in range(len(input_label_arr)):
                    input_lst[self.dataid[input_label_arr[k]]] = input_arr[k]
                pos = new_file.tell()
                cursor.writerow(input_lst)
                file_pos = []
                file_pos.append(input_line[0])
                file_pos.append(pos)
                index_cursor.writerow(file_pos) 
                new_bloom.add(input_line[0])
                self.size = self.size + 1
                if (input_line[0] == merged_line[0]):
                    #print("sdfgsdfg:%s, i:%d" %(input_line[0], i))
                    j = j + 1
                i = i + 1
            elif (input_line[0] > merged_line[0]):
                for k in range(len(merged_label_arr)):
                    merged_lst[self.dataid[merged_label_arr[k]]] = merged_arr[k]
                pos = new_file.tell()
                cursor.writerow(merged_lst)
                file_pos = []
                file_pos.append(merged_line[0])
                file_pos.append(pos)
                index_cursor.writerow(file_pos) 
                new_bloom.add(merged_line[0])
                self.size = self.size + 1
                j = j + 1
            input_file.close()
            merged_file.close()

        while (i < len(input_lines)):
            input_line = input_lines[i].split(',')
            #index 0 is key, index 1 is offset
            input_file = open(merge_table.data_name, 'r')
            input_label = input_file.readline()
            input_label_arr = input_label.rstrip().split(',')
            input_lst = [""] * len(allset)
            input_file.seek(int(input_line[1]))
            input_value = input_file.readline()

            # Avoid merging deleted files
            if ("," not in input_value):
                i = i + 1
                input_file.close()
                continue

            new_bloom.add(input_line[0])
            input_arr = input_value.rstrip().split(',')
            for k in range(len(input_label_arr)):
                input_lst[self.dataid[input_label_arr[k]]] = input_arr[k]
            pos = new_file.tell()
            cursor.writerow(input_lst)
            file_pos = []
            file_pos.append(input_line[0])
            file_pos.append(pos)
            index_cursor.writerow(file_pos) 
            i = i + 1
            self.size = self.size + 1
            input_file.close()

        while (j < len(merged_lines)):
            merged_line = merged_lines[j].split(',')
            #index 0 is key, index 1 is offset
            merged_file = open(self.data_name, 'r')
            merged_label = merged_file.readline()
            merged_label_arr = merged_label.rstrip().split(',')
            merged_lst = [""] * len(allset)
            merged_file.seek(int(merged_line[1]))
            merged_value = merged_file.readline()

            # Avoid merging deleted files
            if ("," not in merged_value):
                j = j + 1
                merged_file.close()
                continue

            new_bloom.add(merged_line[0])
            merged_arr = merged_value.rstrip().split(',')
            for k in range(len(merged_label_arr)):
                merged_lst[self.dataid[merged_label_arr[k]]] = merged_arr[k]
            pos = new_file.tell()
            cursor.writerow(merged_lst)
            file_pos = []
            file_pos.append(merged_line[0])
            file_pos.append(pos)
            index_cursor.writerow(file_pos) 
            j = j + 1
            self.size = self.size + 1
            merged_file.close()

        
        new_file.close()
        new_index.close()
        os.remove(self.data_name)
        os.remove(self.index_name)
        os.remove(merge_table.data_name)
        os.remove(merge_table.index_name)
        self.file = new_file
        self.index = new_index
        self.bloom = new_bloom

        os.rename("tmp_file", self.data_name)
        os.rename("tmp_index", self.index_name)
Beispiel #28
0
class Main():
    def __init__(self):
        self.have_met = BloomFilter(max_elements=100000, error_rate=0.1)
        self.t = time.time()
        self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
        self.post_DB = True

    def mainTopBuzz(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        tb = TopBuzz()

        # Request timestamp
        t = time.time()
        # Use a regex to split the timestamp into integer and fractional parts
        result = re.findall('.\d*', str(t))
        sign = tb.hash_code(result[1][1:])
        timestamp = result[0]
        url_tb = 'https://i16-tb.isnssdk.com/api/844/stream?session_impr_id=0&tab=General&count=20&min_behot_time=1.554174097999E9&loc_mode=7&lac=4314&cid=6439033' \
              '&sign='+sign+ \
              '&timestamp='+timestamp+ \
              '&logo=topbuzz&gender=0&bv_is_auto_play=0&youtube=0&manifest_version_code=844&app_version=8.4.4&iid=6672646082571388678&gaid=54b268f4-52c2-470c-a815-abd1d00acce9&original_channel=gp&channel=gp&fp=TlTrJzK1FYsqFYs5PlU1LMGSL2Xr&device_type=MIX+2&language=en&app_version_minor=8.4.4.01&resolution=2030*1080&openudid=ab50caa43e995042&update_version_code=8440&sys_language=zh&sys_region=cn&os_api=26&tz_name=Asia%2FShanghai&tz_offset=28800&dpi=440&brand=Xiaomi&ac=WIFI&device_id=6672637176796333574&os=android&os_version=8.0.0&version_code=844&hevc_supported=1&device_brand=Xiaomi&device_platform=android&sim_region=cn&region=us&aid=1106&ui_language=en'
        news_list = tb.sendRequest(url=url_tb)
        path = '/data/crawler'
        pic_path = '/topbuzz/picture/'
        number = 1
        for url in news_list:
            if url not in self.have_met:
                self.have_met.add(url)
                data = n.parsingUrl(url=url,
                                    downloadPath=path,
                                    picPath=pic_path)
                if data is None:
                    pass
                else:
                    print('TB_detail_url\t', url)
                    print('TB_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=1)
                    else:
                        s.saveMySql(data=data)
            else:
                pass

    def mainNewsBreak(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        nb = NewsBreak()
        url_nb = 'http://api.particlenews.com/Website/channel/news-list-for-best-channel?cstart=0&infinite=true&refresh=1&epoch=5&distribution=newsbreak&platform=1&cv=4.7.3&cend=10&appid=newsbreak&weather=true&fields=docid&fields=date&fields=image&fields=image_urls&fields=like&fields=source&fields=title&fields=url&fields=comment_count&fields=fb_share_total&fields=coach_mark_text&fields=up&fields=down&fields=summary&fields=favicon_id&fields=dominant_image&fields=contextMeta&fields=video_urls&fields=viewType&push_refresh=0&modularize=true&ts=2019-04-07+18%3A14%3A01+%2B0800&version=020025&net=wifi'
        docId = nb.parsingPost(url=url_nb)
        get_url = 'http://api.particlenews.com/Website/contents/content?related_docs=false&cv=4.7.3' \
                  '&docid=' + docId + \
                  '&appid=newsbreak&bottom_channels=false&distribution=newsbreak&platform=1&version=020025&net=wifi'
        news_list = nb.parsingGet(url=get_url)
        path = '/data/crawler'
        pic_path = '/newsbreak/picture/'
        number = 1
        for url in news_list:
            if url not in self.have_met:
                self.have_met.add(url)
                data = n.parsingUrl(url=url,
                                    downloadPath=path,
                                    picPath=pic_path)
                if data is None:
                    pass
                else:
                    print('NB_detail_url\t', url)
                    print('NB_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=2)
                    else:
                        s.saveMySql(data=data)
            else:
                pass

    def mainBuzzFeed(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        bf = BuzzFeed()
        top_urls = bf.parsingTopUrl()
        news_urls = bf.parsingNewsUrl()
        urls_list = top_urls + news_urls
        path = '/data/crawler'
        pic_path = '/buzzfeed/picture/'
        number = 1
        for url in urls_list:
            if url not in self.have_met:
                self.have_met.add(url)
                data = n.parsingUrl(url=url,
                                    downloadPath=path,
                                    picPath=pic_path)
                if data is None:
                    pass
                else:
                    print('BF_detail_url\t', url)
                    print('BF_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=3)
                    else:
                        s.saveMySql(data=data)
            else:
                pass

    def mainGoogleNews(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        gn = GoogleNews()
        news_list = gn.googleNews()
        path = '/data/crawler'
        pic_path = '/googleNews/picture/'
        number = 1
        for new in news_list:
            url = new.link.text
            if url not in self.have_met:
                self.have_met.add(url)
                data = n.parsingUrl(url=url,
                                    downloadPath=path,
                                    picPath=pic_path)
                if data is None:
                    pass
                else:
                    print('GN_detail_url\t', url)
                    print('GN_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=4)
                    else:
                        s.saveMySql(data=data)
            else:
                pass

    def mainSmartNews(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        sm = SmartNews()
        news_list = sm.smartNews()
        path = '/data/crawler'
        pic_path = '/smartNews/picture/'
        number = 1
        for new in news_list:
            if new not in self.have_met:
                self.have_met.add(new)
                data = n.parsingUrl(url=new,
                                    downloadPath=path,
                                    picPath=pic_path)
                if data is None:
                    pass
                else:
                    print('SM_detail_url\t', new)
                    print('SM_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=5)
                    else:
                        s.saveMySql(data=data)
            else:
                pass
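
# Hypothetical driver (not in the original snippet): each main* method above
# de-duplicates article urls through the shared have_met Bloom filter, so
# running the sources back to back skips anything already fetched in this run.
if __name__ == '__main__':
    m = Main()
    m.mainTopBuzz()
    m.mainNewsBreak()
    m.mainBuzzFeed()
    m.mainGoogleNews()
    m.mainSmartNews()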
Beispiel #29
0
                print("已处理日志", count)
                query_infos = []
                print("正在进行日志处理")


    sql = "INSERT INTO long_domain_queries"\
        "(query_time, query_domain, query_fld, query_client_ip) VALUES(%s, %s, %s, %s)"
    cursor.executemany(sql, long_query_infos)
    sql = "INSERT INTO suspect_client_ip(ip, date, value) VALUES(%s, %s, 1) "\
        "ON DUPLICATE KEY UPDATE value=value+1"
    cursor.executemany(sql, black_query_infos)
    connection.commit()
    tobe_check = []
    for record in get_distinct_fld(date, cursor):
        domain = record[0]
        if domain not in visited_domain_filter:
            print(domain)
            sql = '''
                INSERT INTO domain_first_seen 
                SELECT query_fld, MIN(query_time)
                FROM queries_2020_01_07
                WHERE query_fld=%s
            '''
            cursor.execute(sql, domain)
            visited_domain_filter.add(domain)
            tobe_check.append(domain)
    with open("拉清单.txt", "a") as f:
        for i in tobe_check:
            f.write(i + "\n")
    connection.commit()
Beispiel #30
0
class DeyangSpider(scrapy.Spider):
    # Sichuan Deyang public resources trading
    name = 'deyang_spider'
    allowed_domains = ['ggzyxx.deyang.gov.cn']

    # Initialization
    def __init__(self, *args, **kwargs):
        # Root url of the site to crawl
        self.base_url = 'http://ggzyxx.deyang.gov.cn/'
        super(DeyangSpider, self).__init__(*args, **kwargs)
        self.bloom_filter = BloomFilter(max_elements=1000000,
                                        error_rate=0.1,
                                        filename='bf.data')
        self.num = 0
        self.scrawl_mode = ScrawlMode.HISTORY
        self._stop_parse = False

    # main entry point: see start_requests below

    def start_requests(self):
        """
        Default scrapy entry point that starts the crawl.
        :return:
        """
        _info_type = {
            "tradeinfo_jygcjs_": {"工程建设"},
            "tradeinfo_jycg_": {"政府采购"},
            "tradeinfo_gygt_": {"国土矿业权"}
            # , "tradeinfo_jygzcq_": {"国资产权"}
        }

        for _info_item in (_info_type.keys()):
            _change_url = "http://ggzyxx.deyang.gov.cn/pub/{}/".format(
                _info_item)
            _page_url = "http://ggzyxx.deyang.gov.cn/pub/{}.html".format(
                _info_item)
            _page_meta = {"_info_item": _info_item, "_change_url": _change_url}
            time.sleep(1)
            yield scrapy.Request(url=_page_url,
                                 callback=self.parse_init,
                                 meta={'_page_meta': _page_meta})

    def parse_init(self, response):
        """
        :param response:
        :return:
        """
        self._stop_parse = False
        _total_num = response.xpath(
            './/div[@class="pagenations"]/a/text()').extract()[4]
        print(_total_num)
        print("----------------------------")

        # if int(_total_num) > 0:
        #     try:
        #         for _page_num_item in range(int(_total_num)):
        #             _page_init_detail_url = response.meta["_page_meta"]["_change_url"]
        #             _page_init_detail_url = _page_init_detail_url + "/{}.html".format(_page_num_item + 1)
        #             response.meta["_page_meta"]["_page_init_detail_url"] = _page_init_detail_url
        #
        #             time.sleep(1)
        #             yield scrapy.Request(url=_page_init_detail_url, callback=self.parse_detail,
        #                                  meta={'_page_meta': response.meta["_page_meta"]})
        #     except:
        #         logging.exception(' _total_num is faild {}'.format(response.url))

    def parse_detail(self, response):
        _info_type_detail = {
            "tradeinfo_jygcjs_": "工程建设",
            "tradeinfo_jycg_": "政府采购",
            "tradeinfo_gygt_": "国土矿业权"
            # , "tradeinfo_jygzcq_": "国资产权"
        }

        item = DeyangItem()
        for selector in response.xpath('.//div[@class="search-result"]/ul/li'):
            time.sleep(random.randint(100, 200) / 1000.0)  # 100 - 200 ms
            # Url of the announcement
            _content_url = selector.xpath('./a/@href').extract_first()
            _detail_page_url = response.urljoin(_content_url)
            item['url'] = _detail_page_url

            # Unique identifier
            _unq_id = CcgpUtil.get_unique_id(_detail_page_url)
            item['_id'] = _unq_id

            # Skip records that have already been seen
            if _unq_id in self.bloom_filter:
                continue

            self.bloom_filter.add(_unq_id)

            # Region of the announcement
            item['area'] = "德阳市"

            print(_detail_page_url)
            # Specific district of the announcement
            # item['area_detail'] = self.__get_area_detail__(selector, _detail_page_url)

            # Tenderer
            item['buyer'] = " "

            # Notice type
            _info_item_detail = response.meta["_page_meta"]["_info_item"]
            item['notice_type'] = _info_type_detail[_info_item_detail]

            # source
            item['source'] = "deYang"

            # site
            item['site'] = "deYang"

            # Announcement time
            item['notice_time'] = self.__get_notice_time__(
                selector, _detail_page_url)
            # Announcement title
            item['title'] = self.__get_title__(selector, _detail_page_url)

            # Content
            item['content'] = self.__get_content__(selector, _detail_page_url)

            print(item)

    @staticmethod
    def __get_area_detail__(selector, url):
        _ret = ''
        _area_detail = ["上绕市", "银川市", "石嘴山市", "吴忠市", "固原市", "中卫市"]

        try:
            _content_text = selector.xpath(
                'string(./div[@class="ewb-info-a"]/a)').extract()[0]
            _content_text = ''.join(_content_text.split())
            for _item in _area_detail:
                if _item in _content_text:
                    _ret = _item
                    break
        except:
            logging.exception('{} get_area_detail__ failed'.format(url))

        return _ret

    @staticmethod
    def __get_notice_time__(selector, url):
        _ret = ''
        try:
            _bid_info = selector.xpath(
                './span[@class="time"]/text()').extract_first()
            if _bid_info:
                _ret = _bid_info.replace('-', '.') + " 00:00:00"
        except:
            logging.exception('{} get_notice_time failed'.format(url))
        return _ret

    @staticmethod
    def __get_title__(selector, url):
        _ret = ''
        try:
            _ret = selector.xpath(
                './a[@class="weekdays"]/text()').extract_first().replace(
                    '\n', '').strip()
        except:
            logging.exception('{} get_title failed'.format(url))

        return _ret

    @staticmethod
    def __get_content__(selector, url):
        """
        Body content of the announcement.
        If extracting the body fails, the whole crawl of this item is considered
        failed, so the error must not be silently swallowed by try/except.
        :param selector:
        :param url:
        :return:
        """
        _bad = False
        _ret = ''
        try:
            _r = requests.get(url, timeout=15)
            _r.encoding = 'utf-8'
            _ret = base64.b64encode(zlib.compress(
                _r.text.encode('utf-8'))).decode('utf-8')
        except:
            _bad = True

        # Retry once if the first request raised an exception
        if _bad:
            time.sleep(1)
            _r = requests.get(url, timeout=15)
            _r.encoding = 'utf-8'
            _ret = base64.b64encode(zlib.compress(
                _r.text.encode('utf-8'))).decode('utf-8')

        return _ret
class MyFilterSwitch(app_manager.RyuApp):
    OFP_VERSIONS = [ofproto_v1_3.OFP_VERSION]

    def _monitor(self):
        self.web = FlaskAppWrapper('wrap')
        self.web.add_endpoint(endpoint='/add',
                              endpoint_name='add',
                              handler=self.action)
        self.web.run()

    def __init__(self, *args, **kwargs):
        super(MyFilterSwitch, self).__init__(*args, **kwargs)
        self.swList = {}  # list of known switches
        self.hostDB = {}  # pairing between host ID and port number
        self.bloom = BloomFilter(max_elements=10000, error_rate=0.1)
        self.randomFilter(12345678, 1000)
        self.monitor_thread = hub.spawn(self._monitor)

    def randomFilter(self, seed, maxcount):
        random.seed(seed)

        def randomIP(upperbound):
            return str(random.randint(1, upperbound))

        allIP = []
        for i in range(maxcount):
            newIP = "10." + "0" + "." + randomIP(10) + "." + randomIP(250)
            allIP.append(newIP)
            self.bloom.add(newIP)
        print "IP address blocked: " + str(len(allIP))

    def action(self):
        def dropNewIP(dst_ip):
            dp = self.swList[1]
            match = dp.ofproto_parser.OFPMatch(in_port=1,
                                               eth_type=0x800,
                                               ip_proto=0x11,
                                               ipv4_dst=dst_ip)
            actions = []
            self.add_flow(dp, match, actions, 2)
            print("update flow rule, match: IP to " + dst_ip + " output:drop")

        if request.method != 'POST':
            ipadr = str(request.args.get('ip'))
            if ipadr in self.bloom:
                print "IP address: " + ipadr + " already exist!"
            else:
                print "Adding IP address: " + ipadr + " into the black list"
                self.bloom.add(ipadr)
                dropNewIP(ipadr)
            return "Hello world"

    def add_flow(self, datapath, match, actions, priority=1):
        ofproto = datapath.ofproto

        # basic instruction that applies every action in the actions list
        inst = [
            datapath.ofproto_parser.OFPInstructionActions(
                ofproto.OFPIT_APPLY_ACTIONS, actions)
        ]
        mod = datapath.ofproto_parser.OFPFlowMod(
            datapath=datapath,  #switch id
            cookie=0,
            cookie_mask=0,
            table_id=0,  # flow table number where the rule is installed
            command=ofproto.OFPFC_ADD,
            idle_timeout=0,
            hard_timeout=0,  # timeout = 0 -> the rule never expires
            priority=priority,  # determines the matching order
            buffer_id=ofproto.OFP_NO_BUFFER,
            out_port=ofproto.OFPP_ANY,
            out_group=ofproto.OFPG_ANY,
            flags=0,
            match=match,  # match fields
            instructions=inst)  # action instructions
        datapath.send_msg(mod)

    @set_ev_cls(ofp_event.EventOFPSwitchFeatures, CONFIG_DISPATCHER)
    def switch_features_handler(self, ev):
        msg = ev.msg
        dp = msg.datapath
        ofproto = dp.ofproto

        self.swList[dp.id] = dp

        # send every IP packet to the controller
        match = dp.ofproto_parser.OFPMatch(eth_type=0x800)
        actions = [
            dp.ofproto_parser.OFPActionOutput(ofproto.OFPP_CONTROLLER,
                                              ofproto.OFPCML_NO_BUFFER)
        ]

        self.add_flow(dp, match, actions)

    @set_ev_cls(ofp_event.EventOFPPacketIn, MAIN_DISPATCHER)
    def _packet_in_handler(self, ev):
        msg = ev.msg
        in_port = msg.match['in_port']
        dp = msg.datapath
        ofproto = dp.ofproto
        dpid = dp.id
        pkt = packet.Packet(msg.data)

        ipv4_headers = pkt.get_protocols(ipv4.ipv4)
        pkt_ipv4 = ipv4_headers[0] if ipv4_headers else None

        if pkt_ipv4:
            dst_ip = pkt_ipv4.dst
            print "new request: ", dst_ip
            if dst_ip in self.bloom:
                # install a flow rule to drop the packet
                match = dp.ofproto_parser.OFPMatch(in_port=1,
                                                   eth_type=0x800,
                                                   ip_proto=0x11,
                                                   ipv4_dst=dst_ip)
                actions = []
                self.add_flow(dp, match, actions, 2)
                print("install flow rule, match: IP to " + dst_ip +
                      " output:drop")
            else:
                # forward this packet out to host 2
                actions = [dp.ofproto_parser.OFPActionOutput(2, 0)]
                data = msg.data
                out = dp.ofproto_parser.OFPPacketOut(datapath=dp,
                                                     buffer_id=msg.buffer_id,
                                                     in_port=in_port,
                                                     actions=actions,
                                                     data=data)
                dp.send_msg(out)

                # install a flow rule so later packets to this IP reach host 2 without involving the controller
                match = dp.ofproto_parser.OFPMatch(in_port=1,
                                                   eth_type=0x800,
                                                   ip_proto=0x11,
                                                   ipv4_dst=dst_ip)
                actions = [dp.ofproto_parser.OFPActionOutput(2, 0)]
                self.add_flow(dp, match, actions, 2)
                print("install flow rule, match: IP to " + dst_ip +
                      " output:2")
Beispiel #32
0
    list_FN_positions = list()
    cnt = 0
    for label in y_label:
        if label == 1:
            if (y_pred[cnt] == 0):
                list_FN_positions.append(cnt)
        cnt = cnt + 1

    bloom = None
    from bloom_filter import BloomFilter
    if len(list_FN_positions) > 0:
        bloom = BloomFilter(max_elements=len(list_FN_positions),
                            error_rate=fpr_b)

        for idx in list_FN_positions:
            bloom.add(str(X[idx, 0]))

        #Memory usage
        print("Number of bits:", bloom.num_bits_m)

    end = datetime.datetime.now()  # BUILD TIME ENDS - TODO: measure this in microseconds as earlier
    print('Model + Trad Bloom Build Time: ', str(end - start))

    # serialize model
    from joblib import dump, load
    filename = 'model_rbf' + str(np.random.randint(1000))
    # print(filename)
    dump(rbf_svc, filename + '.compressed', compress=True)
    fName = filename + '.compressed'
    print('Model stored in ', fName)
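
# Hedged sketch (not in the original snippet) of how a lookup would combine the
# classifier with the backup filter built above -- the "learned Bloom filter"
# pattern. rbf_svc, X and bloom are assumed to exist as in the surrounding code;
# key_features is the feature vector and key_string the string form of the key.
def learned_contains(key_features, key_string):
    if rbf_svc.predict([key_features])[0] == 1:
        return True     # model says present (may be a false positive)
    if bloom is not None and key_string in bloom:
        return True     # caught by the backup filter of the model's false negatives
    return False        # the backup filter covers every false negative, so the
                        # combined structure never misses an inserted key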
Beispiel #33
0
print("Number of Items: " + str(num_of_items))
p = 0.005
print("Probability of false positive error " + str(p))


bit_size = bit_array_size(num_of_items, p)
print("Bit Size: "+str(bit_size))

hash_size = size_of_hash(num_of_items, bit_size)
print("Hash Size: "+str(hash_size))

bf = BloomFilter(num_of_items, hash_size)
with open("word_list.txt") as word_file:
    word_list = word_file.read().splitlines()

for word in word_list:
    bf.add(word)
    
print(bf.lookup("99"))

print(bf.lookup("donkey")) 
print(bf.lookup("oitqv")) 
print(bf.lookup("fart"))
print(bf.lookup("Max"))
print(bf.lookup("Dichha"))
print(bf.lookup("Khuwalung"))

print("++++Random Word SpellChecker++++")


alpha=""