Code example #1
def get_insert_product_with_offer(offer) -> InsertOne:
    # Build a single InsertOne request for a new product document seeded from
    # the offer, linking the offer back by its _id.
    return InsertOne({
        **get_new_product_from_offer(offer), "offers": [offer["_id"]]
    })
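
The helper above only builds the write request; nothing is sent until it is passed to Collection.bulk_write. A minimal usage sketch (the products collection, the offers iterable, and get_new_product_from_offer are assumptions based on the snippet's context, not code from the original project):

# Hypothetical usage: batch the requests and send them in one round trip.
requests = [get_insert_product_with_offer(offer) for offer in offers]
if requests:  # bulk_write() rejects an empty request list
    result = products.bulk_write(requests, ordered=False)
    print("inserted:", result.inserted_count)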
Code example #2
    def test_cluster_time(self):
        listener = SessionTestListener()
        # Prevent heartbeats from updating $clusterTime between operations.
        client = rs_or_single_client(event_listeners=[listener],
                                     heartbeatFrequencyMS=999999)
        collection = client.pymongo_test.collection
        # Prepare for tests of find() and aggregate().
        collection.insert_many([{} for _ in range(10)])
        self.addCleanup(collection.drop)
        self.addCleanup(client.pymongo_test.collection2.drop)

        def bulk_insert(ordered):
            if ordered:
                bulk = collection.initialize_ordered_bulk_op()
            else:
                bulk = collection.initialize_unordered_bulk_op()
            bulk.insert({})
            bulk.execute()

        def rename_and_drop():
            # Ensure collection exists.
            collection.insert_one({})
            collection.rename('collection2')
            client.pymongo_test.collection2.drop()

        def insert_and_find():
            cursor = collection.find().batch_size(1)
            for _ in range(10):
                # Advance the cluster time.
                collection.insert_one({})
                next(cursor)

            cursor.close()

        def insert_and_aggregate():
            cursor = collection.aggregate([], batchSize=1).batch_size(1)
            for _ in range(5):
                # Advance the cluster time.
                collection.insert_one({})
                next(cursor)

            cursor.close()

        ops = [
            # Tests from Driver Sessions Spec.
            ('ping', lambda: client.admin.command('ping')),
            ('aggregate', lambda: list(collection.aggregate([]))),
            ('find', lambda: list(collection.find())),
            ('insert_one', lambda: collection.insert_one({})),

            # Additional PyMongo tests.
            ('insert_and_find', insert_and_find),
            ('insert_and_aggregate', insert_and_aggregate),
            ('update_one',
             lambda: collection.update_one({}, {'$set': {'x': 1}})),
            ('update_many',
             lambda: collection.update_many({}, {'$set': {'x': 1}})),
            ('delete_one', lambda: collection.delete_one({})),
            ('delete_many', lambda: collection.delete_many({})),
            ('bulk_write', lambda: collection.bulk_write([InsertOne({})])),
            ('ordered bulk', lambda: bulk_insert(True)),
            ('unordered bulk', lambda: bulk_insert(False)),
            ('rename_and_drop', rename_and_drop),
        ]

        for name, f in ops:
            listener.results.clear()
            # Call f() twice, insert to advance clusterTime, call f() again.
            f()
            f()
            collection.insert_one({})
            f()

            self.assertGreaterEqual(len(listener.results['started']), 1)
            for i, event in enumerate(listener.results['started']):
                self.assertTrue(
                    '$clusterTime' in event.command,
                    "%s sent no $clusterTime with %s" % (
                        f.__name__, event.command_name))

                if i > 0:
                    succeeded = listener.results['succeeded'][i - 1]
                    self.assertTrue(
                        '$clusterTime' in succeeded.reply,
                        "%s received no $clusterTime with %s" % (
                            f.__name__, succeeded.command_name))

                    self.assertTrue(
                        event.command['$clusterTime']['clusterTime'] >=
                        succeeded.reply['$clusterTime']['clusterTime'],
                        "%s sent wrong $clusterTime with %s" % (
                            f.__name__, event.command_name))
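
SessionTestListener is defined elsewhere in PyMongo's test suite. A minimal stand-in that would satisfy this test's use of listener.results (a sketch, not the actual helper) could look like:

from collections import defaultdict

from pymongo import monitoring


class SessionTestListener(monitoring.CommandListener):
    """Collect command monitoring events, grouped by phase."""

    def __init__(self):
        self.results = defaultdict(list)

    def started(self, event):
        self.results['started'].append(event)

    def succeeded(self, event):
        self.results['succeeded'].append(event)

    def failed(self, event):
        self.results['failed'].append(event)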
Code example #3
def article(headers2):
    a = 0
    key_list = guanjianci.key_list  # search keywords defined elsewhere

    a = a + 1
    for lis in key_list:
        try:
            from random import choice
            par = dict(params)
            par['q'] = lis
            arr = [50]
            arrs = choice(arr)
            time.sleep(int(arrs))
            response = ss.get('https://www.facebook.com/search/posts',
                              params=par,
                              headers=headers2)
            print(response.status_code)
            # if response.status_code != 200:
            #     headers2 = headers1
            #     pass
            content = response.content.decode('utf-8')
            id = re.compile('"id":"vm-(.*?):').findall(str(content))
            url = re.compile('"permalink":"(.*?)"').findall(str(content))
            if url == []:
                print('No results, entering sleep')
                time.sleep(7200)
                break
            for ur in url:
                try:
                    urls = str(ur).replace('\\', '')
                    # urls = 'https://www.facebook.com/groups/2337886349768125/posts/2883216531901768'
                    arrs = choice(arr)
                    time.sleep(int(arrs))
                    res = ss.get(urls, headers=headers2)
                    article = res.content.decode('utf-8')
                    articles = re.compile('"wwwURL":"(.*?)"').findall(
                        str(article))
                    times = re.compile('"creation_time":(.*?),').findall(
                        str(article))
                    likeCount = re.compile(
                        '"reaction_count":{"count":(.*?),"').findall(
                            str(article))
                    title = re.compile('"message":{"text":"(.*?)"},"').findall(
                        str(article))
                    for urs, ti, like, til in zip(articles, times, likeCount,
                                                  title):
                        try:
                            ac = ''
                            ab = re.compile('(\\\\ud...)').findall(str(til))
                            for te in ab:
                                til = til.replace(te, '')
                            if til[-1] == '\\':
                                til = til[:-1]
                            tils = til.encode(
                                'utf-8', 'replace').decode('unicode-escape')
                            urss = str(urs).replace('\\', '')
                            timeArray = time.localtime(int(ti))
                            pubTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                    timeArray)
                            arcontent = tils
                            site = "Facebook"
                            siteId = 1049117
                            pushState = 0
                            downloadTime = datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S')
                            data = []
                            data.append(
                                InsertOne({
                                    "url": urss,
                                    "title": tils,
                                    "pub_time": pubTime,
                                    "content": arcontent,
                                    "download_time": downloadTime,
                                    "site": site,
                                    "site_id": siteId,
                                    "aid": urss,
                                    'push_state': pushState,
                                    'like_num': int(like),
                                }))
                            try:
                                collection.bulk_write(data)
                                print('Insert complete')
                                print('Download time ' + downloadTime)
                                print('Publish time ' + pubTime)
                            except Exception as err:
                                print('Duplicate entry, skipped')
                                print('Download time ' + downloadTime)
                                print('Publish time ' + pubTime)
                        except Exception as err:
                            import traceback

                            traceback.print_exc()
                            pass
                except Exception as err:
                    import traceback

                    traceback.print_exc()
                    pass
        except Exception as err:
            import traceback

            traceback.print_exc()
            pass
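
The inner except branch above treats any bulk_write failure as a duplicate. If "aid" is meant to be unique (an assumption), the same intent can be expressed explicitly with a unique index and an unordered bulk write, inspecting BulkWriteError for duplicate-key errors:

from pymongo import InsertOne
from pymongo.errors import BulkWriteError

# One-time setup: enforce uniqueness on the article id.
collection.create_index("aid", unique=True)

def insert_ignoring_duplicates(collection, docs):
    """Insert documents in one batch; count duplicates instead of aborting."""
    requests = [InsertOne(doc) for doc in docs]
    if not requests:
        return 0
    try:
        return collection.bulk_write(requests, ordered=False).inserted_count
    except BulkWriteError as exc:
        duplicates = [e for e in exc.details.get("writeErrors", [])
                      if e.get("code") == 11000]
        print("duplicates skipped:", len(duplicates))
        return exc.details.get("nInserted", 0)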
Code example #4
File: integrate_new.py  Project: kalisp/pype
    def register(self, instance):
        # Required environment variables
        anatomy_data = instance.data["anatomyData"]

        io.install()

        context = instance.context

        project_entity = instance.data["projectEntity"]

        context_asset_name = context.data["assetEntity"]["name"]

        asset_name = instance.data["asset"]
        asset_entity = instance.data.get("assetEntity")
        if not asset_entity or asset_entity["name"] != context_asset_name:
            asset_entity = io.find_one({
                "type": "asset",
                "name": asset_name,
                "parent": project_entity["_id"]
            })
            assert asset_entity, (
                "No asset found by the name \"{0}\" in project \"{1}\""
            ).format(asset_name, project_entity["name"])

            instance.data["assetEntity"] = asset_entity

            # update anatomy data with asset specific keys
            # - name should already be set
            hierarchy = ""
            parents = asset_entity["data"]["parents"]
            if parents:
                hierarchy = "/".join(parents)
            anatomy_data["hierarchy"] = hierarchy

        task_name = instance.data.get("task")
        if task_name:
            anatomy_data["task"] = task_name

        stagingdir = instance.data.get("stagingDir")
        if not stagingdir:
            self.log.info(
                ("{0} is missing reference to staging directory."
                 " Will try to get it from representation.").format(instance))

        else:
            self.log.debug(
                "Establishing staging directory @ {0}".format(stagingdir))

        # Ensure at least one file is set up for transfer in staging dir.
        repres = instance.data.get("representations")
        assert repres, "Instance has no files to transfer"
        assert isinstance(
            repres,
            (list,
             tuple)), ("Instance 'files' must be a list, got: {0} {1}".format(
                 str(type(repres)), str(repres)))

        subset = self.get_subset(asset_entity, instance)
        instance.data["subsetEntity"] = subset

        version_number = instance.data["version"]
        self.log.debug("Next version: v{}".format(version_number))

        version_data = self.create_version_data(context, instance)

        version_data_instance = instance.data.get('versionData')
        if version_data_instance:
            version_data.update(version_data_instance)

        # TODO rename method from `create_version` to
        # `prepare_version` or similar...
        version = self.create_version(subset=subset,
                                      version_number=version_number,
                                      data=version_data)

        self.log.debug("Creating version ...")

        new_repre_names_low = [_repre["name"].lower() for _repre in repres]

        existing_version = io.find_one({
            'type': 'version',
            'parent': subset["_id"],
            'name': version_number
        })

        if existing_version is None:
            version_id = io.insert_one(version).inserted_id
        else:
            # Check if the instance has `append` mode set, in which case
            # only representations duplicated by the new ones are archived
            append_repres = instance.data.get("append", False)

            # Update version data
            # TODO query by _id and
            io.update_many(
                {
                    'type': 'version',
                    'parent': subset["_id"],
                    'name': version_number
                }, {'$set': version})
            version_id = existing_version['_id']

            # Find representations of existing version and archive them
            current_repres = list(
                io.find({
                    "type": "representation",
                    "parent": version_id
                }))
            bulk_writes = []
            for repre in current_repres:
                if append_repres:
                    # archive only duplicated representations
                    if repre["name"].lower() not in new_repre_names_low:
                        continue
                # Representation must change type,
                # `_id` must be stored to other key and replaced with new
                # - that is because new representations should have same ID
                repre_id = repre["_id"]
                bulk_writes.append(DeleteOne({"_id": repre_id}))

                repre["orig_id"] = repre_id
                repre["_id"] = io.ObjectId()
                repre["type"] = "archived_representation"
                bulk_writes.append(InsertOne(repre))

            # bulk updates
            if bulk_writes:
                io._database[io.Session["AVALON_PROJECT"]].bulk_write(
                    bulk_writes)

        version = io.find_one({"_id": version_id})
        instance.data["versionEntity"] = version

        existing_repres = list(
            io.find({
                "parent": version_id,
                "type": "archived_representation"
            }))

        instance.data['version'] = version['name']

        intent_value = instance.context.data.get("intent")
        if intent_value and isinstance(intent_value, dict):
            intent_value = intent_value.get("value")

        if intent_value:
            anatomy_data["intent"] = intent_value

        anatomy = instance.context.data['anatomy']

        # Find the representations to transfer amongst the files
        # Each should be a single representation (as such, a single extension)
        representations = []
        destination_list = []

        if 'transfers' not in instance.data:
            instance.data['transfers'] = []

        template_name = self.template_name_from_instance(instance)

        published_representations = {}
        for idx, repre in enumerate(instance.data["representations"]):
            published_files = []

            # create template data for Anatomy
            template_data = copy.deepcopy(anatomy_data)
            if intent_value is not None:
                template_data["intent"] = intent_value

            resolution_width = repre.get("resolutionWidth")
            resolution_height = repre.get("resolutionHeight")
            fps = instance.data.get("fps")

            if resolution_width:
                template_data["resolution_width"] = resolution_width
            if resolution_height:
                template_data["resolution_height"] = resolution_height
            if fps:
                template_data["fps"] = fps

            files = repre['files']
            if repre.get('stagingDir'):
                stagingdir = repre['stagingDir']

            if repre.get("outputName"):
                template_data["output"] = repre['outputName']

            template = os.path.normpath(
                anatomy.templates[template_name]["path"])

            sequence_repre = isinstance(files, list)
            repre_context = None
            if sequence_repre:
                self.log.debug("files: {}".format(files))
                src_collections, remainder = clique.assemble(files)
                self.log.debug("src_tail_collections: {}".format(
                    str(src_collections)))
                src_collection = src_collections[0]

                # Assert that each member has identical suffix
                src_head = src_collection.format("{head}")
                src_tail = src_collection.format("{tail}")

                # fix dst_padding
                valid_files = [x for x in files if src_collection.match(x)]
                padd_len = len(valid_files[0].replace(src_head, "").replace(
                    src_tail, ""))
                src_padding_exp = "%0{}d".format(padd_len)

                test_dest_files = list()
                for i in [1, 2]:
                    template_data["representation"] = repre['ext']
                    template_data["frame"] = src_padding_exp % i
                    anatomy_filled = anatomy.format(template_data)
                    template_filled = anatomy_filled[template_name]["path"]
                    if repre_context is None:
                        repre_context = template_filled.used_values
                    test_dest_files.append(os.path.normpath(template_filled))
                template_data["frame"] = repre_context["frame"]

                self.log.debug("test_dest_files: {}".format(
                    str(test_dest_files)))

                dst_collections, remainder = clique.assemble(test_dest_files)
                dst_collection = dst_collections[0]
                dst_head = dst_collection.format("{head}")
                dst_tail = dst_collection.format("{tail}")

                index_frame_start = None

                if repre.get("frameStart"):
                    frame_start_padding = int(anatomy.templates["render"].get(
                        "frame_padding",
                        anatomy.templates["render"].get("padding")))

                    index_frame_start = int(repre.get("frameStart"))

                # exception for slate workflow
                if index_frame_start and "slate" in instance.data["families"]:
                    index_frame_start -= 1

                dst_padding_exp = src_padding_exp
                dst_start_frame = None
                for i in src_collection.indexes:
                    # TODO 1.) do not count padding in each index iteration
                    # 2.) do not count dst_padding from src_padding before
                    #   index_frame_start check
                    src_padding = src_padding_exp % i

                    src_file_name = "{0}{1}{2}".format(src_head, src_padding,
                                                       src_tail)

                    dst_padding = src_padding_exp % i

                    if index_frame_start:
                        dst_padding_exp = "%0{}d".format(frame_start_padding)
                        dst_padding = dst_padding_exp % index_frame_start
                        index_frame_start += 1

                    dst = "{0}{1}{2}".format(dst_head, dst_padding,
                                             dst_tail).replace("..", ".")

                    self.log.debug("destination: `{}`".format(dst))
                    src = os.path.join(stagingdir, src_file_name)

                    self.log.debug("source: {}".format(src))
                    instance.data["transfers"].append([src, dst])

                    published_files.append(dst)

                    # for adding first frame into db
                    if not dst_start_frame:
                        dst_start_frame = dst_padding

                # Store used frame value to template data
                template_data["frame"] = dst_start_frame
                dst = "{0}{1}{2}".format(dst_head, dst_start_frame,
                                         dst_tail).replace("..", ".")
                repre['published_path'] = dst

            else:
                # Single file
                #  _______
                # |      |\
                # |       |
                # |       |
                # |       |
                # |_______|
                #
                template_data.pop("frame", None)
                fname = files
                assert not os.path.isabs(fname), (
                    "Given file name is a full path")

                template_data["representation"] = repre['ext']

                src = os.path.join(stagingdir, fname)
                anatomy_filled = anatomy.format(template_data)
                template_filled = anatomy_filled[template_name]["path"]
                repre_context = template_filled.used_values
                dst = os.path.normpath(template_filled).replace("..", ".")

                instance.data["transfers"].append([src, dst])

                published_files.append(dst)
                repre['published_path'] = dst
                self.log.debug("__ dst: {}".format(dst))

            repre["publishedFiles"] = published_files

            for key in self.db_representation_context_keys:
                value = template_data.get(key)
                if not value:
                    continue
                repre_context[key] = template_data[key]

            # Use previous representation's id if there are any
            repre_id = None
            repre_name_low = repre["name"].lower()
            for _repre in existing_repres:
                # NOTE should we check lowered names?
                if repre_name_low == _repre["name"]:
                    repre_id = _repre["orig_id"]
                    break

            # Create new id if existing representations does not match
            if repre_id is None:
                repre_id = io.ObjectId()

            representation = {
                "_id": repre_id,
                "schema": "pype:representation-2.0",
                "type": "representation",
                "parent": version_id,
                "name": repre['name'],
                "data": {
                    'path': dst,
                    'template': template
                },
                "dependencies": instance.data.get("dependencies", "").split(),

                # Imprint shortcut to context
                # for performance reasons.
                "context": repre_context
            }

            if repre.get("outputName"):
                representation["context"]["output"] = repre['outputName']

            if sequence_repre and repre.get("frameStart"):
                representation['context']['frame'] = (
                    dst_padding_exp % int(repre.get("frameStart")))

            self.log.debug("__ representation: {}".format(representation))
            destination_list.append(dst)
            self.log.debug("__ destination_list: {}".format(destination_list))
            instance.data['destination_list'] = destination_list
            representations.append(representation)
            published_representations[repre_id] = {
                "representation": representation,
                "anatomy_data": template_data,
                "published_files": published_files
            }
            self.log.debug("__ representations: {}".format(representations))

        # Remove old representations if there are any (before insertion of new)
        if existing_repres:
            repre_ids_to_remove = []
            for repre in existing_repres:
                repre_ids_to_remove.append(repre["_id"])
            io.delete_many({"_id": {"$in": repre_ids_to_remove}})

        self.log.debug("__ representations: {}".format(representations))
        for rep in instance.data["representations"]:
            self.log.debug("__ represNAME: {}".format(rep['name']))
            self.log.debug("__ represPATH: {}".format(rep['published_path']))
        io.insert_many(representations)
        instance.data["published_representations"] = (
            published_representations)
        # self.log.debug("Representation: {}".format(representations))
        self.log.info("Registered {} items".format(len(representations)))
Code example #5
def my_job():
    # menuId = ['http://www.ccdi.gov.cn/ldhd/gcsy/', 'http://www.ccdi.gov.cn/ldhd/wbld/',
    #           'http://www.ccdi.gov.cn/xxgk/hyzl/', 'http://www.ccdi.gov.cn/yaowen/', 'http://www.ccdi.gov.cn/pl/',
    #           'http://www.ccdi.gov.cn/gzdt/jdjc/', 'http://www.ccdi.gov.cn/gzdt/dfzf/', 'http://www.ccdi.gov.cn/xsxc/',
    #           'http://www.ccdi.gov.cn/gzdt/zzjs/', 'http://www.ccdi.gov.cn/gzdt/gjhz/',
    #           'http://www.ccdi.gov.cn/gzdt/jcfc/', 'http://www.ccdi.gov.cn/lswh/wenhua/',
    #           'http://www.ccdi.gov.cn/lswh/lilun/', 'http://www.ccdi.gov.cn/scdc/zggb/zjsc/',
    #           'http://www.ccdi.gov.cn/scdc/zggb/djcf/', 'http://www.ccdi.gov.cn/scdc/zyyj/zjsc/',
    #           'http://www.ccdi.gov.cn/scdc/zyyj/djcf/', 'http://www.ccdi.gov.cn/scdc/sggb/zjsc/',
    #           'http://www.ccdi.gov.cn/scdc/sggb/djcf/']
    menuId = [1, 2, 3, 4]
    for chl in menuId:
        paramssss = dict(paramsss)
        paramssss['page'] = chl
        canshushijian = time.strftime("%Y.%m.%d", time.localtime())
        paramssss[
            'was_custom_expr'] = "'((的)) AND (DocRelTime = (\' " + canshushijian + "'\'))'"
        # location = os.getcwd() + '/fake_useragent.json'
        # ua = fake_useragent.UserAgent(path=location)
        # print(ua.random)
        # headers['User-Agent'] = ua.random
        print('111')
        try:
            # pro = ips()
            # ss.proxies = pro

            response = ss.post(
                'https://www.ccdi.gov.cn/was5/web/search',
                timeout=10,
                headers=headersss,
                params=paramssss,
            )
            print(response.status_code)

            content = response.content.decode('utf-8')
            a = re.compile("<a href='(.*?)' target=\"_blank\">").findall(
                str(content))
            sj = """5,6,7,8,9"""
            sjs = set(sj.split(','))
            print("444")
            for sjss in sjs:
                sjs = sjss

            time.sleep(int(sjs))
            aaaa = 0
            for ac in a:
                str2 = filter(str.isdigit, ac)  # str2 is a filter object
                str3 = list(str2)
                num = "".join(str3)
                # location = os.getcwd() + '/fake_useragent.json'
                # ua = fake_useragent.UserAgent(path=location)
                headers[
                    'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.84'
                url = ac
                try:
                    res = ss.get(url,
                                 headers=headerss,
                                 timeout=10,
                                 cookies=cookiess,
                                 verify=False)
                    acontent = res.content.decode('utf-8')
                    articleContent = re.compile(
                        '<div class="TRS_Editor">([\s\S]*?)</div>').findall(
                            str(acontent))
                    if articleContent == []:
                        for neirong in range(10):
                            # location = os.getcwd() + '/fake_useragent.json'
                            # ua = fake_useragent.UserAgent(path=location)
                            # headers['User-Agent'] = ua.random
                            res = ss.get(url,
                                         headers=headers,
                                         timeout=10,
                                         cookies=cookies,
                                         verify=False)
                            acontent = res.content.decode('utf-8')
                            articleContent = re.compile(
                                '<div class="TRS_Editor">([\s\S]*?)</div>'
                            ).findall(str(acontent))
                            articleContents = re.compile(
                                '<div class="content">([\s\S]*?)</div>'
                            ).findall(str(acontent))
                            if articleContents or articleContent:
                                if articleContents:
                                    articleContent = articleContents
                                    break
                                else:
                                    break
                            else:
                                print()
                    title = re.compile(
                        '<h2 class="tit">([\s\S]*?)</h2>').findall(
                            str(acontent))
                    print(url)
                    pubTime = re.compile('发布时间:([\s\S]*?)</em>').findall(
                        str(acontent))
                    pubTimes = datetime.datetime.strptime(
                        pubTime[0], "%Y-%m-%d %H:%M")
                    imglist = re.compile(
                        '<img [\s\S]*?." src="([\s\S]*?.)" />').findall(
                            str(articleContent[0]))
                    strs = articleContent[0]
                    for im in imglist:
                        ims = im[4:10]
                        # chls = chl+ims+'/'
                        # strs = strs.replace('\\', '')
                        # Build the absolute image URL
                        # ims = im.replace('\\','')
                        c = urllib.parse.urljoin(url, im)

                        # Replace the relative image path with the absolute URL
                        # content =    re.sub(i, c, contentText)
                        strs = strs.replace(im, c)

                    if articleContent:
                        site = "中央纪律检查委员会"
                        site_id = 1049418
                        push_state = 0
                        downloadTime = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        datass = []
                        datass.append(
                            InsertOne({
                                "url": url,
                                "title": title[0],
                                "aid": num,
                                "content": strs,
                                "site": site,
                                "pub_time": pubTimes,
                                "push_state": push_state,
                                "site_id": site_id,
                                "download_Time": downloadTime
                            }))
                        insertdb(datass)
                    else:
                        pass

                except Exception as err:

                    traceback.print_exc()
                    pass
        except Exception as err:

            traceback.print_exc()
            pass
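
insertdb() is not shown in this excerpt. Given that it receives a list of InsertOne requests, a plausible minimal implementation (an assumption, with collection standing for whatever target collection the original script uses) would be:

from pymongo.errors import BulkWriteError

def insertdb(requests):
    """Send a batch of InsertOne requests; tolerate duplicate-key errors."""
    if not requests:
        return
    try:
        collection.bulk_write(requests, ordered=False)
        print('Insert complete')
    except BulkWriteError:
        print('Duplicates ignored')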
Code example #6
                          {'$set': {
                              'poll': row['Poll']
                          }}))

        person = people.find_one({
            'addressId': address['_id'],
            'name': row['Name']
        })

        rowVoted = row['Voted'] == 'Y'
        if person is None:
            peopleWrites.append(
                InsertOne({
                    'name': row['Name'],
                    'addressId': address['_id'],
                    'voted': rowVoted,
                    'created': datetime.datetime.utcnow(),
                    'updated': datetime.datetime.utcnow()
                }))
        else:
            # Documents store the flag as lowercase 'voted' (see the InsertOne above).
            voted = person.get('voted')
            if voted is None or voted != rowVoted:
                peopleWrites.append(
                    UpdateOne({'_id': person['_id']}, {
                        '$set': {
                            'voted': rowVoted,
                            'updated': datetime.datetime.utcnow()
                        }
                    }))

    if len(addressWrites) > 0:
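
The excerpt is cut off at the final flush. A sketch of how the accumulated batches are presumably written out (the addresses collection name is an assumption; people appears in the snippet):

# Flush each batch in a single round trip; skip empty batches because
# bulk_write() raises InvalidOperation for an empty request list.
if addressWrites:
    addresses.bulk_write(addressWrites, ordered=False)
if peopleWrites:
    people.bulk_write(peopleWrites, ordered=False)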
Code example #7
# Replace the placeholder credentials below with your connection URI from the Atlas UI
client = MongoClient("mongodb+srv://analytics:<password>@<your-cluster>.mongodb.net/test?retryWrites=true&w=majority")

people_raw = client.cleansing['people-raw']

batch_size = 1000
inserts = []
count = 0

# Instead of inserting one document at a time, we add each insert to a batch,
# and when the batch reaches the batch size limit, send the whole batch to
# the server in a single bulk_write call.
with open("./people-raw.json") as dataset:
    for line in dataset:
        inserts.append(InsertOne(loads(line)))

        count += 1

        if count == batch_size:
            people_raw.bulk_write(inserts)
            inserts = []
            count = 0

if inserts:
    people_raw.bulk_write(inserts)
    count = 0

# Confirm that 50,474 documents are in your collection before moving on
# (count() is deprecated/removed; count_documents is the supported API)
people_raw.count_documents({})

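
For plain inserts like this, the same batching can also be expressed with insert_many, which splits large batches into server-sized chunks automatically. A sketch of the equivalent load (same file and collection as above; loads is assumed to come from bson.json_util):

batch = []
with open("./people-raw.json") as dataset:
    for line in dataset:
        batch.append(loads(line))
        if len(batch) == batch_size:
            people_raw.insert_many(batch, ordered=False)
            batch = []
if batch:
    people_raw.insert_many(batch, ordered=False)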
Code example #8
        continue

    for item in fed["banned"].items():
        user_id = item[0]
        ban = item[1]
        new = {
            "fed_id": fed["fed_id"],
            "user_id": user_id,
            "by": ban["by"],
            "time": ban["time"],
        }

        if "reason" in ban:
            new["reason"] = ban["reason"]

        if "banned_chats" in ban:
            new["banned_chats"] = ban["banned_chats"]

        queue.append(InsertOne(new))

    mongodb.fed_bans.bulk_write(queue)
    mongodb.feds.update_one({"fed_id": fed["fed_id"]},
                            {"$unset": {
                                "banned": 1
                            }})
    changed_feds += 1

log.info("Update done!")
log.info("Modified feds - " + str(changed_feds))
log.info("Unchanged feds - " + str(all_feds_count - changed_feds))
Code example #9
    def handle(self, *args, **options):

        # Establish MongoDB connection
        client = settings.MONGO_CLIENT
        db = client[options['database']]
        exam_collection = db.get_collection(options['exam_collection'])
        tag_collection = db.get_collection(options['tag_collection'])

        # Test whether the uniqueness constraint is defined; create it if not
        # (this will only happen when the collection is first created)
        if not exam_collection.index_information().get('exam_uniqueness_constraint', None):

            exam_collection.create_index([
                ('exam_id', DESCENDING),
                ('revision', DESCENDING),
            ], unique=True, name="exam_uniqueness_constraint")

        scanners = options['scanners']
        years = options['years']
        months = options['months']
        days = options['days']

        parsed_data_path = Path(options['data'])

        if not scanners:
            scanner_paths = [scanner_path for scanner_path in parsed_data_path.iterdir() if scanner_path.is_dir()]
        else:
            scanner_paths = [scanner_path for scanner_path in parsed_data_path.iterdir()
                             if (scanner_path.is_dir() and (scanner_path.name in scanners))]

        for scanner_path in sorted(scanner_paths):

            if not years:
                year_paths = [year_path for year_path in scanner_path.iterdir() if year_path.is_dir()]
            else:
                year_paths = [year_path for year_path in scanner_path.iterdir()
                              if (year_path.is_dir() and (year_path.name in years))]

            for year_path in sorted(year_paths):

                if not months:
                    month_paths = [month_path for month_path in year_path.iterdir() if month_path.is_dir()]
                else:
                    month_paths = [month_path for month_path in year_path.iterdir()
                                   if (month_path.is_dir() and (month_path.name in months))]

                for month_path in sorted(month_paths):

                    if not days:
                        day_paths = [day_path for day_path in month_path.iterdir() if day_path.is_dir()]
                    else:
                        day_paths = [day_path for day_path in month_path.iterdir()
                                     if (day_path.is_dir() and (day_path.name in days))]

                    for day_path in sorted(day_paths):

                        for exam_path in sorted([e for e in day_path.iterdir() if e.is_dir()]):

                            for pt_dir in sorted([p for p in exam_path.iterdir() if p.is_dir()]):

                                for session_dir in sorted([s for s in pt_dir.iterdir() if s.is_dir()]):

                                    tags_to_create = []

                                    study_metadata_files = list(session_dir.glob("study_*_metadata.txt"))

                                    if not study_metadata_files:
                                        self.stdout.write("Error: No study metadata found in {}".format(session_dir))
                                        continue

                                    if len(study_metadata_files) > 1:
                                        self.stdout.write("Error: Multiple study metadata "
                                                          "files for {} found".format(session_dir))
                                        continue

                                    study_meta_file = study_metadata_files[0]

                                    if not study_meta_file.is_file():
                                        self.stdout.write("Error: Cannot load file {}".format(study_meta_file))
                                        continue

                                    self.stdout.write("Loading data from {}".format(str(study_meta_file)))

                                    try:
                                        with open(str(study_meta_file), "rt") as sm:
                                            study_metadata = json.load(sm)
                                    except ValueError:
                                        self.stdout.write("Error: Cannot load file {}".format(study_meta_file))
                                        continue

                                    metadata = study_metadata['metadata']
                                    data = study_metadata['data']

                                    dicom_data = None
                                    for subdir in data:
                                        if subdir.get('dicom_data', None):
                                            dicom_data = subdir['dicom_data']
                                            break

                                    if not dicom_data:
                                        self.stdout.write("Error: No DICOM metadata "
                                                          "for exam {}".format(study_meta_file))
                                        continue

                                    try:
                                        exam_id = metadata['exam_id']
                                        revision = 1
                                        parser_version = metadata['parser_version']
                                        filepath = metadata['gold_fpath']
                                        checksum = metadata['gold_archive_checksum']
                                    except KeyError:
                                        self.stdout.write("Error: Required metadata field not "
                                                          "available for exam {}".format(study_meta_file))
                                        continue

                                    try:
                                        station_name = get_fmrif_scanner(dicom_data["00081010"]["Value"][0])
                                    except (KeyError, IndexError):
                                        station_name = None

                                    if not station_name:
                                        station_name = filepath.split("/")[0]

                                    try:
                                        study_instance_uid = dicom_data["0020000D"]['Value'][0]
                                    except (KeyError, IndexError):
                                        study_instance_uid = None

                                    try:
                                        study_id = dicom_data["00200010"]['Value'][0]
                                    except (KeyError, IndexError):
                                        study_id = None

                                    try:
                                        study_date = dicom_data["00080020"]['Value'][0]
                                        study_date = datetime.strptime(study_date, '%Y%m%d').date()
                                    except (KeyError, IndexError):
                                        study_date = None

                                    if not study_date:
                                        year, month, day = filepath.split("/")[1:4]
                                        study_date = "{}{}{}".format(year, month, day)
                                        study_date = datetime.strptime(study_date, '%Y%m%d').date()

                                    try:
                                        study_time = dicom_data["00080030"]['Value'][0]
                                        if "." in study_time:
                                            study_time = datetime.strptime(study_time, '%H%M%S.%f').time()
                                        else:
                                            study_time = datetime.strptime(study_time, '%H%M%S').time()
                                    except (KeyError, IndexError):
                                        study_time = None

                                    if study_time:
                                        study_datetime = datetime.combine(study_date, study_time)
                                    else:
                                        study_datetime = datetime.combine(study_date, datetime_time.min)

                                    try:
                                        study_description = dicom_data["00081030"]['Value'][0]
                                    except (KeyError, IndexError):
                                        study_description = None

                                    protocol = None  # Not implemented yet

                                    try:
                                        accession_number = dicom_data["00080050"]['Value'][0]
                                    except (KeyError, IndexError):
                                        accession_number = None

                                    try:
                                        name = dicom_data["00100010"]['Value'][0]['Alphabetic']
                                    except (KeyError, IndexError):
                                        name = None

                                    if name:
                                        name_fields = parse_pn(name)
                                        last_name = name_fields['family_name']
                                        first_name = name_fields['given_name']
                                    else:
                                        first_name, last_name = None, None

                                    try:
                                        patient_id = dicom_data["00100020"]['Value'][0]
                                    except (KeyError, IndexError):
                                        patient_id = None

                                    try:
                                        sex = dicom_data["00100040"]['Value'][0]
                                    except (KeyError, IndexError):
                                        sex = None

                                    try:
                                        birth_date = dicom_data["00100030"]['Value'][0]
                                        birth_date = datetime.strptime(birth_date, '%Y%m%d')
                                    except (KeyError, IndexError):
                                        birth_date = None

                                    new_exam = {
                                        'exam_id': exam_id,
                                        'revision': revision,
                                        'parser_version': parser_version,
                                        'filepath': filepath,
                                        'checksum': checksum,
                                        'station_name': station_name,
                                        'study_instance_uid': study_instance_uid,
                                        'study_id': study_id,
                                        'study_datetime': study_datetime,
                                        'study_description': study_description,
                                        'protocol': protocol,
                                        'accession_number': accession_number,
                                        'name': name,
                                        'last_name': last_name,
                                        'first_name': first_name,
                                        'patient_id': patient_id,
                                        'sex': sex,
                                        'birth_date': birth_date,
                                    }

                                    new_exam_id = exam_collection.insert_one(new_exam).inserted_id

                                    study_data = study_metadata['data']

                                    mr_scans = []

                                    for subdir in study_data:
                                        if subdir.get('dicom_data', None):
                                            mr_scans.append(subdir)

                                    self.stdout.write("Found {} mr scans".format(len(mr_scans)))

                                    for scan in mr_scans:

                                        try:

                                            scan_dicom_data = scan['dicom_data']
                                            scan_name = scan['metadata']['gold_scan_dir']

                                        except KeyError:

                                            self.stdout.write("Error: Missing mandatory scan metadata, "
                                                              "omitting scan from exam {}".format(study_meta_file))
                                            continue

                                        for tag, attr in scan_dicom_data.items():

                                            vr = attr.get('vr', None)

                                            if not vr:
                                                self.stdout.write(
                                                    "WARNING: No VR found for tag {} in scan {} "
                                                    "of study {}. Skipping.".format(tag, scan_name,
                                                                                    study_meta_file))
                                                continue

                                            if vr in ['OB', 'OD', 'OF', 'OL', 'OV', 'OW', 'SQ', 'UN']:
                                                self.stdout.write(
                                                    "WARNING: Tag encoding of type B64 or JSON not supported "
                                                    "for querying purposes - Tag {} in scan {} "
                                                    "of study {}. Skipping.".format(tag, scan_name,
                                                                                    study_meta_file))
                                                continue

                                            try:

                                                new_tag = self.parse_attribute(new_exam_id, tag, scan_name, attr)
                                                tags_to_create.append(InsertOne(new_tag))

                                            except AttributeError:
                                                self.stdout.write(
                                                    "Attribute value exceeds indexable size. Skipping. Tag {} in "
                                                    "scan of study {}".format(tag, scan_name, study_meta_file)
                                                )

                                    try:

                                        res = tag_collection.bulk_write(tags_to_create)

                                        self.stdout.write("Inserted {} tags to collection".format(res.inserted_count))

                                    except PyMongoError as e:

                                        self.stdout.write("Error: Unable to insert scan documents "
                                                          "for day ".format(day_path))
                                        self.stdout.write(e)
                                        self.stdout.write(traceback.format_exc())
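
One edge case the command above does not guard against: if every tag of a session is skipped, tags_to_create ends up empty and bulk_write raises InvalidOperation, which is a PyMongoError subclass and so lands in the generic error branch with a misleading message. A small guard sketch for that final block:

if not tags_to_create:
    self.stdout.write("No indexable tags found in {}, skipping".format(session_dir))
    continue

res = tag_collection.bulk_write(tags_to_create)
self.stdout.write("Inserted {} tags to collection".format(res.inserted_count))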
Code example #10
def my_job(ac):
    try:
        while (ac <= 10):
            proxy = {}
            paramss = dict(params)
            if ac == 0:
                paramss['start'] = ""
            else:

                paramss['start'] = ac * 50
            ac = ac + 1
            agentUrl = "http://47.96.91.228:82/get/"
            res = requests.get(agentUrl)

            agenContent = res.content.decode("utf-8")
            dataip = re.compile('"proxy": "(.*?)",').findall(str(agenContent))
            ip = dataip[0]
            # requests expects scheme keys ('http', 'https') mapping to the proxy URL.
            proxy = {
                'http': 'http://' + ip,
                'https': 'http://' + ip,
            }

            ss.proxies = proxy
            response = ss.get('https://www.douban.com/group/',
                              headers=headers,
                              params=paramss,
                              cookies=cookies)
            htmlContent = response.content.decode("utf-8")
            urlList = re.compile(
                '<a href="https://www.douban.com/group/topic/(.*?)/"').findall(
                    str(htmlContent))
            for i in urlList:
                url = "https://www.douban.com/group/topic/" + i
                time.sleep(10)
                urlResponse = ss.get(url, headers=headers, cookies=cookies)
                bs = BeautifulSoup(urlResponse.content,
                                   'html.parser',
                                   from_encoding='utf-8')
                htmlContents = urlResponse.content.decode("utf-8")
                title = re.compile(
                    '<h1>(\s|[\r\n])*(.*?)(\s|[\r\n])*</h1>').findall(
                        str(htmlContents))
                for ti in title:
                    titles = ti[1]
                pubTimes = re.compile('"dateCreated": "(.*?)",').findall(
                    str(htmlContents))
                pubTimes = pubTimes[0]
                pubTimes = pubTimes.replace("T", " ")
                aa = bs.select('div.rich-content.topic-richtext')
                downloadTime = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                site = "豆瓣"
                siteId = 1044573
                data = []
                articleStatue = 0
                articleContent = aa[0]
                articleContent = str(articleContent)
                data.append(
                    InsertOne({
                        "url": url,
                        "title": titles,
                        "pub_time": pubTimes,
                        "content": articleContent,
                        "download_time": downloadTime,
                        "site": site,
                        "site_id": siteId,
                        "aid": i,
                        'push_state': articleStatue
                    }))
                insertdb(data)
    except Exception as err:
        traceback.print_exc()
        pass
Code example #11
    def reducer(self, key, values):
        """
        Cleans the billing data:
            -checks gaps
            -checks overlappings
            -generates daily dataframe
            -checks outliers
            -saves results to RAW DATA and HBASE
        :param key: the device
        :param values: the information
        :return:
        """
        #create dataframe with the values:
        df = pd.DataFrame.from_records(values, columns=["ts_ini", "ts_end", "value", "energytype", "source"])
        # group it by source and energyType
        source_group = df.groupby('source')

        for source, df_source_group in source_group:
            etype_group = df_source_group.groupby('energytype')
            for etype, df_etype_group in etype_group:
                df_etype_group = df_etype_group.dropna(subset=["ts_ini"])
                df_etype_group = df_etype_group.set_index('ts_ini')
                df_etype_group = df_etype_group.sort_index()
                df_etype_group['ts_ini'] = df_etype_group.index
                # save billing information in raw_data
                raw_data = df_etype_group[["ts_ini", "ts_end", "value"]].to_dict('records')
                for r in raw_data:
                    r.update({"device": key, "source": source, "energy_type": etype, "data_type": "billing", "freq": "D"})

                ops = [InsertOne(x) for x in raw_data]
                result = self.mongo['raw_data'].bulk_write(
                    [
                        DeleteMany({"device": key, "source": source, "energy_type": etype, "data_type": "billing", "freq": "D"}),
                    ] + ops
                )
                # self.mongo['raw_data'].update({"device": key, "source": source, "energy_type": etype, "data_type": "billing"}, {'$set': {
                #         "device": key, "source": source, "energy_type": etype, "companyId": self.companyId,
                #         "raw_data":df_etype_group[["ts_ini","ts_end","value"]].to_dict('records')
                #     }
                # }, upsert=True)
                # # generate daily dataframe dividing by days:
                dfs = []
                for row in df_etype_group.iterrows():
                    index = pd.date_range(row[1]['ts_ini'], row[1]['ts_end'])
                    if index.empty:
                        continue
                    df_temp = pd.DataFrame(
                        data={"index": index, "value": [(float(row[1]['value']) / float(len(index)))] * len(index)}, index=index)
                    dfs.append(df_temp)

                # join the daily dataframes and detect overlaps and gaps
                if not dfs:
                    continue
                global_df = dfs[0]
                overlappings = []
                for df_temp in dfs[1:]:
                    overlappings.extend(global_df.index.intersection(df_temp.index).tolist())
                    global_df = global_df.append(df_temp)
                    global_df.drop_duplicates(keep='last', inplace=True)

                gaps = []
                gap_last_index = global_df[global_df.index.to_series().diff() > pd.Timedelta('1 days')].index.tolist()
                for gf in gap_last_index:
                    index = list(global_df.index).index(gf)
                    gi = list(global_df.index)[index-1]
                    gaps.append([gi,gf])

                #max_threshold = self.config['max_threshold'][etype] * 24 if 'etype' in self.config['max_threshold'] else self.config['max_threshold']['default'] * 24
                #max_outliers_bool = dc.detect_max_threshold_outliers(global_df['value'], max_threshold)
                #global_df['value'] = dc.clean_series(global_df['value'], max_outliers_bool)
                negative_values_bool = dc.detect_min_threshold_outliers(global_df['value'], 0)
                global_df['value'] = dc.clean_series(global_df['value'], negative_values_bool)
                #znorm_bool = dc.detect_znorm_outliers(global_df['value'], 30, mode="global")
                #global_df['value'] = dc.clean_series(global_df['value'], znorm_bool)

                #max_outliers = list(global_df[max_outliers_bool].index)
                negative_outliers = list(global_df[negative_values_bool].index)
                #znorm_outliers = list(global_df[znorm_bool].index)

                clean_data = global_df.to_dict('records')
                for r in clean_data:
                    r.update({"device": key, "source": source, "energy_type": etype, "data_type": "billing", "freq": "D"})

                ops = [InsertOne(x) for x in clean_data]
                result = self.mongo['clean_data'].bulk_write(
                    [
                        DeleteMany({"device": key, "source": source, "energy_type": etype, "data_type": "billing", "freq": "D"}),
                    ] + ops
                )

                # update() is the legacy API; update_one() with upsert is equivalent here.
                self.mongo['data_quality'].update_one(
                    {"device": key, "source": source, "energy_type": etype, "data_type": "billing", "freq": "D"},
                    {"$set": {
                        "overlapings": overlappings,
                        "gaps": gaps,
                        "negative_values": negative_outliers
                    }},
                    upsert=True)

                for row in global_df.iterrows():
                    yield None, "\t".join([str(row[1]['index'].timestamp()), key, str(row[1]['value']), etype, source])
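
# The clean_data write above packs a DeleteMany and the InsertOne operations into
# a single bulk_write; bulk_write is ordered by default, so the delete is applied
# before any insert. A minimal stand-alone sketch of that pattern (the function
# and argument names are placeholders, not part of the original job):
from pymongo import DeleteMany, InsertOne

def replace_documents(collection, filter_query, docs):
    # ordered=True (the default) guarantees the DeleteMany runs before the inserts
    ops = [DeleteMany(filter_query)] + [InsertOne(d) for d in docs]
    return collection.bulk_write(ops)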
Code example #12
0
File: bulk_writer.py  Project: nndii/yadm
 def insert_one(self, document):
     self._batch.append(InsertOne(to_mongo(document)))
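
 # A hedged sketch of how such a buffered writer might flush its batch; the
 # flush method and the _collection attribute are assumptions, not taken from
 # the nndii/yadm project.
 def flush(self):
     if self._batch:
         self._collection.bulk_write(self._batch)
         self._batch = []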
Code example #13
0
                }
            }, {
                "$sort": SON([("total", -1)])
            }]
            cursor = input_collection.aggregate(pipeline)

            for record in cursor:
                sensor = {}
                sensor['measurement_id'] = record['_id']['measurement_id']
                sensor['loc'] = record['_id']['loc']
                sensor['description'] = record['_id']['description']
                sensor['source'] = record['_id']['source']
                sensor['gmt_s_date'] = gmt_s_date
                sensor['gmt_e_date'] = gmt_e_date

                sensors.append(InsertOne(sensor))

            if len(sensors) > 0:
                # delete old sensor records in the output collection
                query = {}
                cursor_count = output_collection.count_documents(query)
                if cursor_count > 0:
                    logging.info(
                        "Found {} records in collection {} from {}".format(
                            cursor_count, output_collection.name,
                            sensorSource))
                    output_collection.delete_many(query)
                    logging.info(
                        "Deleted {} sensors from collection {}".format(
                            sensorSource, output_collection.name))
                # store the sensors into the output collection
                output_collection.bulk_write(sensors)
Code example #14
0
 def sync_labels(self, labels):
     # Create labels
     bulk = []
     l_coll = self.mongo_db["labels"]
     current_labels = {ll["name"]: ll["_id"] for ll in l_coll.find()}
     for label in labels:
         if label in current_labels:
             bulk += [
                 UpdateOne(
                     {"_id": current_labels[label]},
                     {"$set": {setting: True for setting in labels[label]}},
                 )
             ]
         else:
             doc = {
                 # "_id": bson.ObjectId(),
                 "name": label,
                 "description": "",
                 "bg_color1": 8359053,
                 "fg_color1": 16777215,
                 "bg_color2": 8359053,
                 "fg_color2": 16777215,
                 "is_protected": False,
                 # Label scope
                 "enable_agent": False,
                 "enable_service": False,
                 "enable_serviceprofile": False,
                 "enable_managedobject": False,
                 "enable_managedobjectprofile": False,
                 "enable_administrativedomain": False,
                 "enable_authprofile": False,
                 "enable_commandsnippet": False,
                 "enable_commandsnippet": False,
                 #
                 "enable_allocationgroup": False,
                 "enable_networksegment": False,
                 "enable_object": False,
                 "enable_objectmodel": False,
                 "enable_platform": False,
                 "enable_resourcegroup": False,
                 "enable_sensorprofile": False,
                 # CRM
                 "enable_subscriber": False,
                 "enable_subscriberprofile": False,
                 "enable_supplier": False,
                 "enable_supplierprofile": False,
                 # DNS
                 "enable_dnszone": False,
                 "enable_dnszonerecord": False,
                 # IPAM
                 "enable_ipaddress": False,
                 "enable_addressprofile": False,
                 "enable_ipaddressrange": False,
                 "enable_ipprefix": False,
                 "enable_prefixprofile": False,
                 "enable_vrf": False,
                 "enable_vrfgroup": False,
                 # VC
                 "enable_vc": False,
                 "enable_vlan": False,
                 "enable_vlanprofile": False,
                 "enable_vpn": False,
                 "enable_vpnprofile": False,
                 # Exposition scope
                 "expose_metric": False,
                 "expose_datastream": False,
             }
             for setting in labels[label]:
                 doc[setting] = True
             bulk += [InsertOne(doc)]
     if bulk:
         l_coll.bulk_write(bulk, ordered=True)
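
 # A hedged alternative sketch, not the project's actual code: the same
 # create-or-update behaviour can be expressed with one upsert per label,
 # letting $setOnInsert supply defaults only for newly created documents
 # (the defaults shown here are abbreviated; UpdateOne is assumed to be
 # imported from pymongo as in the original module).
 def sync_labels_upsert(self, labels):
     ops = [
         UpdateOne(
             {"name": label},
             {
                 "$set": {setting: True for setting in labels[label]},
                 "$setOnInsert": {"description": "", "is_protected": False},
             },
             upsert=True,
         )
         for label in labels
     ]
     if ops:
         self.mongo_db["labels"].bulk_write(ops, ordered=True)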
Code example #15
0
    def flush(self):
        """Insert all buffered records into the Mongo collection.

            Note:
                Log records are inserted in chronological order into the database.
                The first insert failure that occurs aborts the remaining insert
                operations. All log records inserted successfully will be removed
                from the buffer.
        """

        self.acquire()
        try:
            if not self.buffer:
                return

            bulk_result = self.client[self.database][
                self.collection].bulk_write(
                    [
                        InsertOne({
                            'datetime':
                            datetime.utcfromtimestamp(record.created),
                            'processName':
                            record.processName,
                            'processId':
                            record.process,
                            'threadName':
                            record.threadName,
                            'threadId':
                            record.thread,
                            'pathname':
                            record.pathname,
                            'filename':
                            record.filename,
                            'module':
                            record.module,
                            'funcName':
                            record.funcName,
                            'lineno':
                            record.lineno,
                            'msg':
                            record.msg,
                            'levelname':
                            record.levelname,
                            'levelno':
                            record.levelno,
                            # 'funcargs': record.args,
                        }) for record in self.buffer
                    ],
                    ordered=True)

            if not bulk_result.acknowledged:
                # Handle error here.
                pass

            self.buffer[:bulk_result.inserted_count] = []

        except pymongo.errors.BulkWriteError as bwe:
            # Handle the exception here
            # Error details are available via bwe.details

            self.buffer[:bwe.details.get('nInserted', 0)] = []

        except pymongo.errors.ConnectionFailure as cf:
            # Handle the exception here
            pass

        finally:
            self.release()
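
# A small self-contained sketch (assumes a local mongod) of the partial-failure
# behaviour flush() relies on: with an ordered bulk_write, PyMongo stops at the
# first failing insert, and BulkWriteError.details['nInserted'] reports how many
# documents made it in, which is what allows the buffer to be trimmed safely.
from pymongo import InsertOne, MongoClient
from pymongo.errors import BulkWriteError

client = MongoClient()
coll = client.demo.log_buffer
docs = [{'_id': 1}, {'_id': 1}, {'_id': 2}]  # second insert fails (duplicate key)
try:
    coll.bulk_write([InsertOne(d) for d in docs], ordered=True)
except BulkWriteError as bwe:
    inserted = bwe.details.get('nInserted', 0)  # -> 1
    remaining = docs[inserted:]                 # what would stay in the buffer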
Code example #16
0
import bson.json_util  # required for bson.json_util.loads below
from pymongo import InsertOne, MongoClient

from notes.config import CONN_URI

BATCH_SIZE = 1000  # Batch size for batch insertion

cli = MongoClient(CONN_URI)
people_raw = cli.cleansing['people-raw']

batch_insertions = []
with open('people-raw.json') as f:
    for line in f:
        line_dict = bson.json_util.loads(line)
        # Instead of inserting one document at a time, add the current
        # insertion to a batch, and once the batch reaches the size limit,
        # send all of its insertions to the server in a single call.
        batch_insertions.append(InsertOne(line_dict))
        if len(batch_insertions) == BATCH_SIZE:
            people_raw.bulk_write(batch_insertions)
            print(f'Finished inserting a batch of {BATCH_SIZE} documents')
            batch_insertions = []
# Take care of the last batch of insertions
if batch_insertions:
    people_raw.bulk_write(batch_insertions)
    print(f'Finished inserting a last batch of {len(batch_insertions)} '
          f'documents')

print('Finished all the insertions.')
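
# If insertion order does not matter for this data set (an assumption about the
# workload), the batches can also be sent unordered, so the server attempts the
# remaining inserts even if some fail (for example on duplicate keys) instead of
# stopping at the first error. Reusing people_raw from above:
result = people_raw.bulk_write([InsertOne({'n': i}) for i in range(10)],
                               ordered=False)
print(f'Inserted {result.inserted_count} documents with ordered=False')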
Code example #17
0
File: pipelines.py  Project: lizexiong/scrapy
 def process_item(self, item, spider):
     col = self.db.mDoubaninfo
     op = InsertOne(dict(item))  # avoid shadowing the built-in min()
     col.bulk_write([op])
     return item
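
 # Calling bulk_write with a single InsertOne per item forfeits the batching
 # benefit. A hedged sketch of a buffered variant (the batch size and the
 # self.ops attribute are assumptions; the hooks are standard Scrapy pipeline
 # methods):
 def open_spider(self, spider):
     self.ops = []

 def process_item(self, item, spider):
     # buffer the write and flush in batches instead of one bulk_write per item
     self.ops.append(InsertOne(dict(item)))
     if len(self.ops) >= 500:
         self.db.mDoubaninfo.bulk_write(self.ops)
         self.ops = []
     return item

 def close_spider(self, spider):
     # flush whatever is left when the spider finishes
     if self.ops:
         self.db.mDoubaninfo.bulk_write(self.ops)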
Code example #18
0
async def async_update_matches_by_protocol_no(matchengine: MatchEngine,
                                              protocol_no: str):
    """
    Update trial matches by diff'ing the newly created trial matches against existing matches in
    the db. Delete matches by adding {is_disabled: true} and insert all new matches.
    """
    matches_by_sample_id = matchengine.matches.get(protocol_no, dict())
    updated_time = datetime.datetime.now()
    for matches in matches_by_sample_id.values():
        for match in matches:
            match['_updated'] = updated_time
    if protocol_no not in matchengine.matches or protocol_no not in matchengine._trials_to_match_on:
        log.info(
            f"{matchengine.match_criteria_transform.trial_collection} {protocol_no} was not matched on, not updating {matchengine.match_criteria_transform.trial_collection} matches"
        )
        if not matchengine.skip_run_log_entry:
            matchengine.task_q.put_nowait(RunLogUpdateTask(protocol_no))
        await matchengine.task_q.join()
        return
    log.info(f"Updating matches for {protocol_no}")
    if not matchengine.drop:

        # If no matches are found, disable all match records by sample id
        if not matchengine.matches[protocol_no]:
            for chunk in chunk_list(
                    list(matchengine.
                         clinical_ids_for_protocol_cache[protocol_no]),
                    matchengine.chunk_size):
                matchengine.task_q.put_nowait(
                    UpdateTask([
                        UpdateMany(filter={
                            matchengine.match_criteria_transform.match_trial_link_id:
                            protocol_no,
                            'clinical_id': {
                                '$in': chunk
                            }
                        },
                                   update={
                                       '$set': {
                                           "is_disabled": True,
                                           '_updated': updated_time
                                       }
                                   })
                    ], protocol_no))
        else:
            # Get matches to disable and issue queries
            matches_to_disable = await get_all_except(matchengine, protocol_no,
                                                      matches_by_sample_id)
            delete_ops = await get_delete_ops(matches_to_disable, matchengine)
            matchengine.task_q.put_nowait(UpdateTask(delete_ops, protocol_no))

    for sample_id in matches_by_sample_id.keys():
        if not matchengine.drop:
            new_matches_hashes = [
                match['hash'] for match in matches_by_sample_id[sample_id]
            ]

            # get existing matches in db with identical hashes to newly found matches
            existing = await get_existing_matches(matchengine,
                                                  new_matches_hashes)
            existing_hashes = {result['hash'] for result in existing}
            disabled = {
                result['hash']
                for result in existing if result['is_disabled']
            }

            # insert new matches if they don't already exist. disable everything else
            matches_to_insert = get_matches_to_insert(matches_by_sample_id,
                                                      existing_hashes,
                                                      sample_id)
            matches_to_disable = await get_matches_to_disable(
                matchengine, new_matches_hashes, protocol_no, sample_id)

            # re-enable matches whose hash equals an existing match that is currently disabled
            matches_to_mark_available = [
                m for m in matches_by_sample_id[sample_id]
                if m['hash'] in disabled
            ]
            ops = get_update_operations(matches_to_disable, matches_to_insert,
                                        matches_to_mark_available, matchengine)
        else:
            ops = [
                InsertOne(document=trial_match)
                for trial_match in matches_by_sample_id[sample_id]
            ]
        matchengine.task_q.put_nowait(UpdateTask(ops, protocol_no))

    if not matchengine.skip_run_log_entry:
        matchengine.task_q.put_nowait(RunLogUpdateTask(protocol_no))
    await matchengine.task_q.join()
Code example #19
0
def Insert_Data(data):
    insert_datas = []
    insert_datas.append(InsertOne(data))
    my_collection.bulk_write(insert_datas)
    insert_datas.clear()
Code example #20
0
File: engine.py  Project: gabrielat/noc
 def sync_facts(self):
     """
     Retrieve known facts and synchronize with database
     """
     self.logger.debug("Synchronizing facts")
     # Get facts from CLIPS
     self.logger.debug("Extracting facts")
     e_facts = {}  # uuid -> fact
     try:
         f = self.env.InitialFact()
     except clips.ClipsError:
         return  # No facts
     while f:
         if f.Template and f.Template.Name in self.templates:
             self.facts[f.Index] = f
             args = {}
             for k in f.Slots.keys():
                 v = f.Slots[k]
                 if v == clips.Nil:
                     v = None
                 args[str(k)] = v
             fi = self.fcls[f.Template.Name](**args)
             e_facts[self.get_fact_uuid(fi)] = fi
         f = f.Next()
     # Get facts from database
     now = datetime.datetime.now()
     collection = ObjectFact._get_collection()
     bulk = []
     new_facts = set(e_facts)
     for f in collection.find({"object": self.object.id}):
         if f["_id"] in e_facts:
             fact = e_facts[f["_id"]]
             f_attrs = self.get_fact_attrs(fact)
             if f_attrs != f["attrs"]:
                 # Changed facts
                 self.logger.debug("Fact %s has been changed: %s -> %s",
                                   f["_id"], f["attrs"], f_attrs)
                 bulk += [
                     UpdateOne(
                         {"_id": f["_id"]},
                         {
                             "$set": {
                                 "attrs": f_attrs,
                                 "changed": now,
                                 "label": smart_text(fact)
                             }
                         },
                     )
                 ]
             new_facts.remove(f["_id"])
         else:
             # Removed fact
             self.logger.debug("Fact %s has been removed", f["_id"])
             bulk += [DeleteOne({"_id": f["_id"]})]
     # New facts
     for f in new_facts:
         fact = e_facts[f]
         f_attrs = self.get_fact_attrs(fact)
         self.logger.debug("Creating fact %s: %s", f, f_attrs)
         bulk += [
             InsertOne({
                 "_id": f,
                 "object": self.object.id,
                 "cls": fact.cls,
                 "label": smart_text(fact),
                 "attrs": f_attrs,
                 "introduced": now,
                 "changed": now,
             })
         ]
     if bulk:
         self.logger.debug("Commiting changes to database")
         try:
             collection.bulk_write(bulk)
             self.logger.debug("Database has been synced")
         except BulkWriteError as e:
             self.logger.error("Bulk write error: '%s'", e.details)
             self.logger.error("Stopping check")
     else:
         self.logger.debug("Nothing changed")
Code example #21
0
def my_job():

    for iu in range(126):
        iu = iu + 1
        paramss = dict(params)
        paramss['page'] = iu
        response = requests.get('https://m.dutenews.com/index/ajax/content',
                                headers=headers,
                                params=paramss,
                                cookies=cookies)
        content = response.content.decode('unicode-escape')
        contents = str(content)
        url = re.compile('getUrl\((.*?),\)').findall(contents)
        url = list(set(url))
        for i in url:
            try:
                urls = 'https://plus.dutenews.com/api/v2/feeds/' + i + '/comments'
                # urls = 'https://plus.dutenews.com/api/v2/feeds/17639/comments'
                articleUrl = 'https://page.dutenews.com/H5/sns/#/imgText?id=' + i
                response = requests.get(urls)
                content = response.content.decode('utf-8')
                title = re.compile('"feed_content":"(.*?)",').findall(content)
                titles = title[0].encode('utf-8').decode('unicode_escape')
                if len(title) == 0:
                    title = re.compile('"title":"(.*?)",').findall(
                        str(content))
                pubTime = re.compile('"created_at":"(.*?)",').findall(
                    str(content))
                fmt = '%Y-%m-%dT%H:%M:%SZ'
                t = datetime.datetime.strptime(pubTime[0], fmt)
                # shift to UTC+8 (China Standard Time)
                t += datetime.timedelta(hours=8)
                getobj = GetValue2(content)
                contenttext = getobj.get_values('images')
                videotext = getobj.get_values('video')
                articlecontent = ''
                if contenttext is None:
                    print()
                else:
                    imgs = ''
                    contenttexts = re.compile("'url': '(.*?)',").findall(
                        str(contenttext))
                    for im in contenttexts:

                        imgs += "<br><img src=\'" + im + "\'></img>"
                    articlecontent += titles + imgs
                if videotext is None:
                    print()
                else:
                    videos = ''
                    aa = GetValue2(videotext['resource'])
                    videourl = aa.get_values('url')
                    videos += "<br><video src='" + videourl + "' controls></video>"
                    articlecontent += titles + videos
                site = "读特-鹏友圈"
                siteId = 1048212
                data = []
                articleStatue = 0
                downloadTime = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                data.append(
                    InsertOne({
                        "url": articleUrl,
                        "title": titles,
                        "pub_time": t,
                        "content": articlecontent,
                        "download_time": downloadTime,
                        "site": site,
                        "site_id": siteId,
                        "aid": i,
                        'push_state': articleStatue,
                    }))
                insertdb(data)
            except Exception as err:
                import traceback
                traceback.print_exc()
                pass
Code example #22
0
    def importReference(self,filename):
    
        self.logg.info('Starting Reference Import')
        
        if filename == None:
            self.logg.info('Filename Not Set | Beginning File Search')
            fileDir = os.path.dirname(os.path.abspath(__file__))
            regex = re.compile('^Geography\_\d{8}\_\B(to)_\d{8}\_\B(from)\_\d{8}(.txt.gz)')

            for root, dirs, files in os.walk(fileDir):
                for file in files:
                    if regex.match(file):
                        self.logg.info('Compatible File Found')
                        filename = os.path.join(os.path.abspath(root), file)
                        
        # Reference File is .tsv
        self.logg.info('Starting Reading Reference File')
        my_cols = ["TYPE","A", "B", "C", "D", "E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"]

        # Initialise docType Counters
        totalImportedDocuments = [0,0,0,0,0,0,0,0]
        bulkCounter = 0

        # Open file & read into pd.DataFrame
        with gzip.open(filename, "rt", encoding="cp1252") as f:
            df = pd.read_csv(f, sep="\t",names=my_cols)
            
            self.logg.info('Importing Reference into MongoDB')

            operations = []

            for row in df.itertuples(index=False):
                # Reset out_file
                out_file = None

                if row[0] == 'PIF':
                    # REFTYPE : Document Specification
                    out_file = {
                        "docType": "PIF",
                        "fileVersion": row[1],
                        "sourceSystem": row[2],
                        "TOCid": row[3],
                        "timetableStartDate": row[4],
                        "timetableEndDate": row[5],
                        "cycleType": row[6],
                        "cycleStage": row[7],
                        "creationDate": row[8],
                        "fileSequenceNumber": row[9],
                    }
                    totalImportedDocuments[0] += 1
                elif row[0] == "REF":
                    # REFTYPE : Reference Code
                    out_file = {
                        "docType": "REF",
                        "actionCode": row[1],
                        "codeType": row[2],
                        "description": row[3]
                    }
                    totalImportedDocuments[1] += 1
                elif row[0] == "TLD":
                    # REFTYPE : Timing Load
                    out_file = {
                        "docType": "TLD",
                        "actionCode": row[1],
                        "tractionType": row[2],
                        "trailingLoad": row[3],
                        "speed": row[4],
                        "raGauge": row[5],
                        "description": row[6],
                        "ITPSPowerType": row[7],
                        "ITPSLoad": row[8],
                        "limitingSpeed": row[9],
                    }
                    totalImportedDocuments[2] += 1
                elif row[0] == "LOC":
                    # REFTYPE : Geographical Data
                    out_file = {
                        "docType": "LOC",
                        "actionCode": row[1],
                        "TIPLOC": row[2],
                        "locationName": row[3],
                        "startDate": row[4],
                        "endDate": row[5],
                        "northing": row[6],
                        "easting": row[7],
                        "timingType": row[8],
                        "zone": row[9],
                        "STANOX": row[10],
                        "offNetwork": row[11],
                        "forceLPB": row[12],
                    }
                    totalImportedDocuments[3] += 1
                elif row[0] == "PLT":
                    # REFTYPE : Platform
                    out_file = {
                        "docType": "PLT",
                        "actionCode": row[1],
                        "locationCode": row[2],
                        "platformID": row[3],
                        "startDate": row[4],
                        "endDate": row[5],
                        "length": row[6],
                        "powerSupplyType": row[7],
                        "DDOPassenger": row[8],
                        "DDONonPassenger": row[9],
                    }            
                    totalImportedDocuments[4] += 1
                elif row[0] == "NWK":
                    # REFTYPE : Network Link
                    out_file = {
                        "docType": "NWK",
                        "actionCode": row[1],
                        "originLocation": row[2],
                        "destinationLocation": row[3],
                        "lineCode": row[4],
                        "lineDescription": row[5],
                        "startDate": row[6],
                        "endDate": row[7],
                        "initialDirection": row[8],
                        "finalDirection": row[9],
                        "DDOPassenger": row[10],
                        "DDONonPassenger": row[11],
                        "RETB": row[12],
                        "zone": row[13],
                        "reversible": row[14],
                        "powerSupplyType": row[15],
                        "RA": row[16],
                        "maxTrainLength": row[17],
                    }
                    totalImportedDocuments[5] += 1
                elif row[0] == "TLK":
                    # REFTYPE : Timing Link
                    out_file = {
                        "docType": "NWK",
                        "actionCode": row[1],
                        "originLocation": row[2],
                        "destinationLocation": row[3],
                        "lineCode": row[4],
                        "tractionType": row[5],
                        "trailingLoad": row[6],
                        "speed": row[7],
                        "RA": row[8],
                        "entrySpeed": row[9],
                        "exitSpeed": row[10],
                        "startDate": row[11],
                        "endDate": row[12],
                        "secRunTime": row[13],
                        "description": row[14],
                    }
                    totalImportedDocuments[6] += 1
                else:
                    out_file = {
                        "docType": "DEL"
                    }
                    totalImportedDocuments[7] += 1
                    
                # Append Copy of Dictionary to List
                operations.append(
                            InsertOne(out_file.copy())
                        )

                if ((sum(totalImportedDocuments) % self.inst['standard-bulk-size']) == 0):
                    try:
                        
                        self.mongodb['reference'].bulk_write(operations)

                    except BulkWriteError as bwe:
                        self.logg.error(bwe.details)
                    else:
                        # Reset Bulk Storage
                        operations = []
                        # Increment Counter
                        bulkCounter +=1

                        self.logg.info('REFERENCE Progress | {:.0%} | {} Inserts'.format(sum(totalImportedDocuments)/1.2e6, bulkCounter))  
                    

            # write any remaining operations that did not fill a complete batch
            if operations:
                try:
                    self.mongodb['reference'].bulk_write(operations)
                except BulkWriteError as bwe:
                    self.logg.error(bwe.details)

        self.logg.info('Completed Reference Import Successfully')
Code example #23
0
    def _refresh_object(cls, managed_object):
        from noc.sa.models.managedobject import ManagedObject

        def to_dict(v):
            return dict((r["profile"], r["summary"]) for r in v)

        def to_list(v):
            return [{"profile": k, "summary": v[k]} for k in sorted(v)]

        if hasattr(managed_object, "id"):
            managed_object = managed_object.id
        coll = ServiceSummary._get_collection()
        bulk = []
        # Get existing summary
        old_summary = dict((x["interface"], x) for x in coll.find(
            {"managed_object": managed_object}, {
                "_id": 1,
                "interface": 1,
                "service": 1,
                "subscriber": 1
            },
            comment=
            "[servicesummary._refresh_object] Refresh summary of services for managed object"
        ))
        # Get actual summary
        new_summary = ServiceSummary.build_summary_for_object(managed_object)
        # Merge summaries
        for iface in old_summary:
            if iface not in new_summary:
                # Stale, delete
                bulk += [DeleteOne({"_id": old_summary[iface]["_id"]})]
                continue
            oi = old_summary[iface]
            old_services = to_dict(oi["service"])
            old_subs = to_dict(oi["subscriber"])
            ni = new_summary[iface]
            if old_services != ni["service"] or old_subs != ni["subscriber"]:
                # Changed, update
                bulk += [
                    UpdateOne({"_id": oi["_id"]}, {
                        "$set": {
                            "service": to_list(ni["service"]),
                            "subscriber": to_list(ni["subscriber"])
                        }
                    })
                ]
            # Mark as processed
            del new_summary[iface]
        # Process new items
        bulk += [
            InsertOne({
                "managed_object": managed_object,
                "interface": iface,
                "service": to_list(new_summary[iface]["service"]),
                "subscriber": to_list(new_summary[iface]["subscriber"])
            }) for iface in new_summary
        ]
        if bulk:
            logger.info("Committing changes to database")
            try:
                r = coll.bulk_write(bulk, ordered=False)
                logger.info("Database has been synced")
                logger.info("Modify: %d, Deleted: %d", r.modified_count,
                            r.deleted_count)
            except BulkWriteError as e:
                logger.error("Bulk write error: '%s'", e.details)
                logger.error("Stopping check")
        mo = ManagedObject.get_by_id(managed_object)
        mo.segment.update_summary()
Code example #24
0
File: snapshot.py  Project: shinn1982/vista
    def post(self):

        try:
            args = self.parser.parse(self.params_check, request)
            request_body = request.get_json()

            snapshot_name = args['snapshot_name']
            operation = args['operation']

            if operation == "import":
                filter_dict = {"snapshot_name": snapshot_name}
                if check_duplicated(current_app.mongo.db.snapshot,
                                    **filter_dict):
                    return JsonRes(
                        info={},
                        usr_err_mes=
                        "Duplicated Snapshot name, please change another name!",
                        status=False,
                        data=[],
                        code=400)

                # create snapshot instance
                snapshot_instance = {'snapshot_name': snapshot_name}
                # snapshot_res = current_app.mongo.db.snapshot.insert_one(snapshot_instance)
                current_app.mongo.db.snapshot.insert_one(snapshot_instance)

                # create topo instances
                topo_instance = {'snapshot_name': snapshot_name}
                if 'nodes' in request_body.keys():
                    topo_instance['nodes'] = request_body['nodes']
                    # topo_instance.update({
                    #     'nodes': request_body['nodes']
                    # })
                if 'links' in request_body.keys():
                    topo_instance['links'] = request_body['links']
                    # topo_instance.update({
                    #     'links': request_body['links']
                    # })
                # topo_res = current_app.mongo.db.topo.insert_one(topo_instance)
                current_app.mongo.db.topo.insert_one(topo_instance)

                # create policy instances
                if 'sr_policy' in request_body.keys():
                    sr_policies = request_body['sr_policy']
                    req_list = []
                    for policy in sr_policies:
                        policy_instance = {'snapshot_name': snapshot_name}
                        policy_instance.update(policy)
                        req_list.append(InsertOne(policy_instance))
                    # policies_res = current_app.mongo.db.policy.bulk_write(req_list)
                    current_app.mongo.db.policy.bulk_write(req_list)
                #
                # create global_params instances
                if 'global_params' in request_body.keys():
                    params_instance = {'snapshot_name': snapshot_name}
                    params_instance.update(request_body['global_params'])
                    # params_res = current_app.mongo.db.params.insert_one(params_instance)
                    current_app.mongo.db.params.insert_one(params_instance)

                return JsonRes(data={}, info={}, code=201)

            if operation == "export":

                if not self.snapshot_validate(snapshot_name):
                    return JsonRes(
                        info={},
                        usr_err_mes=
                        "No Snapshot named %s is found! Please import snapshot first!"
                        % snapshot_name,
                        err_code=0,
                        status=False,
                        data=[])

                topo_res = current_app.mongo.db.topo.find_one(
                    {'snapshot_name': snapshot_name})
                del topo_res['snapshot_name']

                policy_res = current_app.mongo.db.policy.find(
                    {'snapshot_name': snapshot_name})
                policy_list = []
                for policy in policy_res:
                    del policy['snapshot_name']
                    del policy['_id']
                    policy_list.append(policy)

                res = {
                    'nodes': topo_res['nodes'],
                    'links': topo_res['links'],
                    'sr_policy': policy_list
                }

                if 'with_params' in args.keys() and args['with_params']:
                    params_res = current_app.mongo.db.params.find_one(
                        {'snapshot_name': snapshot_name})
                    del params_res['snapshot_name']
                    del params_res['_id']

                    res.update({'global_params': params_res})

                return JsonRes(data=res, info={}, code=200)

        except Exception as e:
            LOG.debug(e)
            return JsonRes(info=e,
                           usr_err_mes="MongoDB Update Exception!",
                           status=False,
                           data=[],
                           code=400)
Code example #25
0
# Sum prices; use a name other than the built-in sum() and make the loop
# variable match the name used inside the loop.
total = 0
for doc in coll.find():
    total = total + int(doc['price'])

print('Total Sales Price ${:,}'.format(total))

# Bulk writing
from pymongo import InsertOne, DeleteOne, ReplaceOne

db = client.new_dump_db
coll = db.num_coll

# upsert will insert the new document even if it does not find the one it is
# replacing
requests = [
    InsertOne({'Binel': 100}),
    DeleteOne({'Binel': 100}),
    ReplaceOne({'Binel': 100}, {'Ben': 1000}, upsert=True)
]

results = coll.bulk_write(requests)

print('Final writes are: ', results.inserted_count)
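
# A short follow-up: besides inserted_count, the BulkWriteResult above also
# reports what the other request types did, which is a quick way to confirm
# the upsert behaviour described in the comment.
print('Inserted:', results.inserted_count)   # from the InsertOne
print('Deleted: ', results.deleted_count)    # from the DeleteOne
print('Modified:', results.modified_count)   # replacements that matched a document
print('Upserted:', results.upserted_count)   # replacements that upserted instead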

# Deletions based on criteria
# Need to re-assign the original db though for this to work

db = client.new_db

coll = db.db_collection
Code example #26
0
mongo_url = inifile.get('con', 'mongo_url')
max_data_num = int(inifile.get('insert', 'bulk_max_count'))
wtime_out_millsec = int(inifile.get('insert',
                                    'w_concern_repl_timeout_milisec'))
write_concern_opt = inifile.get('insert', 'w_concern_opt')

client = MongoClient(mongo_url)
timestamp = datetime.now().strftime("%Y/%m/%d %H:%M:%S%f")
db = client.repltestdb

print "@@@ Inserting bulk data %s with write_concern ..." % (max_data_num)
coll = db.get_collection('testcol01',
                         write_concern=WriteConcern(
                             w=write_concern_opt, wtimeout=wtime_out_millsec))
try:
    coll.bulk_write([
        InsertOne({
            "timestamp": timestamp,
            'id': i
        }) for i in range(max_data_num)
    ])
    print "OK."

except BulkWriteError as bwe:
    pprint(bwe.details)

data_count = db.testcol01.count_documents({})
print("count = %s" % data_count)

print("Done!")
Code example #27
0
async def importfbans_func(message, fed, strings, document=None):
    global user_id
    file_type = os.path.splitext(document["file_name"])[1][1:]

    if file_type == "json":
        if document["file_size"] > 1000000:
            await message.reply(strings["big_file_json"].format(num="1"))
            return
    elif file_type == "csv":
        if document["file_size"] > 52428800:
            await message.reply(strings["big_file_csv"].format(num="50"))
            return
    else:
        await message.reply(strings["wrong_file_ext"])
        return

    f = await bot.download_file_by_id(document.file_id, io.BytesIO())
    msg = await message.reply(strings["importing_process"])

    data = None
    if file_type == "json":
        try:
            data = rapidjson.load(f).items()
        except ValueError:
            return await message.reply(strings["invalid_file"])
    elif file_type == "csv":
        data = csv.DictReader(io.TextIOWrapper(f))

    real_counter = 0

    queue_del = []
    queue_insert = []
    current_time = datetime.now()
    for row in data:
        if file_type == "json":
            user_id = row[0]
            data = row[1]
        elif file_type == "csv":
            if "user_id" in row:
                user_id = int(row["user_id"])
            elif "id" in row:
                user_id = int(row["id"])
            else:
                continue
        else:
            raise NotImplementedError

        new = {"fed_id": fed["fed_id"], "user_id": user_id}

        if "reason" in row:
            new["reason"] = row["reason"]

        if "by" in row:
            new["by"] = int(row["by"])
        else:
            new["by"] = message.from_user.id

        if "time" in row:
            new["time"] = datetime.fromtimestamp(int(row["time"]))
        else:
            new["time"] = current_time

        if "banned_chats" in row and type(row["banned_chats"]) is list:
            new["banned_chats"] = row["banned_chats"]

        queue_del.append(
            DeleteMany({
                "fed_id": fed["fed_id"],
                "user_id": user_id
            }))
        queue_insert.append(InsertOne(new))

        if len(queue_insert) == 1000:
            real_counter += len(queue_insert)

            # Run the delete operations before the inserts.
            if queue_del:
                await db.fed_bans.bulk_write(queue_del, ordered=False)
            await db.fed_bans.bulk_write(queue_insert, ordered=False)

            queue_del = []
            queue_insert = []

    # Process last bans
    real_counter += len(queue_insert)
    if queue_del:
        await db.fed_bans.bulk_write(queue_del, ordered=False)
    if queue_insert:
        await db.fed_bans.bulk_write(queue_insert, ordered=False)

    await msg.edit_text(strings["import_done"].format(num=real_counter))
Code example #28
0
File: feds.py  Project: annihilatorrrr/HitsukiX
async def importfbans_func(message, fed, strings, document=None):
    global user_id
    file_type = os.path.splitext(document['file_name'])[1][1:]

    if file_type == 'json':
        if document['file_size'] > 1000000:
            await message.reply(strings['big_file_json'].format(num='1'))
            return
    elif file_type == 'csv':
        if document['file_size'] > 52428800:
            await message.reply(strings['big_file_csv'].format(num='50'))
            return
    else:
        await message.reply(strings['wrong_file_ext'])
        return

    f = await bot.download_file_by_id(document.file_id, io.BytesIO())
    msg = await message.reply(strings['importing_process'])

    data = None
    if file_type == 'json':
        try:
            data = rapidjson.load(f).items()
        except ValueError:
            return await message.reply(strings['invalid_file'])
    elif file_type == 'csv':
        data = csv.DictReader(io.TextIOWrapper(f))

    real_counter = 0

    queue_del = []
    queue_insert = []
    current_time = datetime.now()
    for row in data:
        if file_type == 'json':
            user_id = row[0]
            data = row[1]
        elif file_type == 'csv':
            if 'user_id' in row:
                user_id = int(row['user_id'])
            elif 'id' in row:
                user_id = int(row['id'])
            else:
                continue
        else:
            raise NotImplementedError

        new = {'fed_id': fed['fed_id'], 'user_id': user_id}

        if 'reason' in row:
            new['reason'] = row['reason']

        if 'by' in row:
            new['by'] = int(row['by'])
        else:
            new['by'] = message.from_user.id

        if 'time' in row:
            new['time'] = datetime.fromtimestamp(int(row['time']))
        else:
            new['time'] = current_time

        if 'banned_chats' in row and type(row['banned_chats']) is list:
            new['banned_chats'] = row['banned_chats']

        queue_del.append(
            DeleteMany({
                'fed_id': fed['fed_id'],
                'user_id': user_id
            }))
        queue_insert.append(InsertOne(new))

        if len(queue_insert) == 1000:
            real_counter += len(queue_insert)

            # Run the delete operations before the inserts.
            if queue_del:
                await db.fed_bans.bulk_write(queue_del, ordered=False)
            await db.fed_bans.bulk_write(queue_insert, ordered=False)

            queue_del = []
            queue_insert = []

    # Process last bans
    real_counter += len(queue_insert)
    if queue_del:
        await db.fed_bans.bulk_write(queue_del, ordered=False)
    if queue_insert:
        await db.fed_bans.bulk_write(queue_insert, ordered=False)

    await msg.edit_text(strings['import_done'].format(num=real_counter))
Code example #29
0
    partners = ['Xi', 'Moon', 'Xi', 'Moon', 'Trump', 'Xi']

    for i in range(6):
        events.append({
            'date': dates[i],
            'loc': locs[i],
            'partner': partners[i]
        })
    return events


if __name__ == '__main__':
    events = get_events()
    # store the event information in the MongoDB database
    requests_ = [
        InsertOne({
            '_id':
            hash(i['date'] + i['loc'] + i['partner'] +
                 str(random.randint(0, 100))),
            'event':
            i
        }) for i in tqdm(events)
    ]
    try:
        result = db.event_list.bulk_write(requests_)
        pprint(result.bulk_api_result)
    except BulkWriteError as bwe:
        pprint(bwe.details)
    client.close()
Code example #30
0
def my_job():
    for i in range(25):
        print("222")
        try:
            cc = i + 1
            urlss = "http://www.wangdaisj.com/forum-41-" + str(cc) + ".html"
            response = requests.get(urlss,
                                    headers=headers,
                                    cookies=cookies,
                                    verify=False)
            print("333")
            content = response.content.decode("utf-8")
            url = re.compile('<a href="thread(.*?)"').findall(str(content))
            for ur in url:
                print("444")
                urls = "http://www.wangdaisj.com/thread" + ur
                print(urls)
                responses = requests.get(urls,
                                         headers=headers,
                                         cookies=cookies,
                                         verify=False)
                print(responses.status_code)
                articleContent = responses.content.decode("utf-8")
                title = re.compile(
                    '<meta name="keywords" content="(.*?)" />').findall(
                        str(articleContent))
                print(title)
                bs = BeautifulSoup(articleContent, 'html.parser')
                print('666')
                canshu = re.compile(
                    '<div id="post_(\d{1,10})" class="[\s\S]*?."').findall(
                        str(articleContent))
                print('7777')
                site = "一线生活-深圳社区"
                siteId = 1046280
                push_state = 0
                for cs in canshu:
                    try:
                        print("555")
                        con = bs.find_all(attrs={'id': 'post_' + cs})
                        cons = con[0]
                        pubTieme = re.compile(
                            '发表于 <span title="(.*?)">').findall(str(cons))
                        if pubTieme:
                            print("111")
                        else:
                            pubTieme = re.compile('发表于 (.*?)</em>').findall(
                                str(cons))
                        onlyId = pubTieme[0] + cs
                        contents = re.compile(
                            '<div class="t_fsz">([\s\S]*?.)</div>').findall(
                                str(cons))
                        downloadTime = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        data = []
                        data.append(
                            InsertOne({
                                "url": urls,
                                "title": title[0],
                                "aid": cs,
                                "content": contents[0],
                                "site": site,
                                "pub_time": pubTieme[0],
                                "only_id": onlyId,
                                "push_state": push_state,
                                "site_id": siteId,
                                "download_Time": downloadTime
                            }))
                        insertdb(data)
                    except Exception as err:
                        import traceback
                        print(urls)
                        traceback.print_exc()
                        pass
                    finally:
                        client.close()
        except Exception as err:
            import traceback
            print(urls)
            traceback.print_exc()
            pass