def test_bson_regex(self):
    # Invalid Python regex, though valid PCRE.
    bson_re1 = Regex(r'[\w-\.]')
    self.assertEqual(r'[\w-\.]', bson_re1.pattern)
    self.assertEqual(0, bson_re1.flags)

    doc1 = {'r': bson_re1}
    doc1_bson = (
        b'\x11\x00\x00\x00'            # document length
        b'\x0br\x00[\\w-\\.]\x00\x00'  # r: regex
        b'\x00')                       # document terminator

    self.assertEqual(doc1_bson, BSON.encode(doc1))
    self.assertEqual(doc1, BSON(doc1_bson).decode())

    # Valid Python regex, with flags.
    re2 = re.compile(u'.*', re.I | re.M | re.S | re.U | re.X)
    bson_re2 = Regex(u'.*', re.I | re.M | re.S | re.U | re.X)

    doc2_with_re = {'r': re2}
    doc2_with_bson_re = {'r': bson_re2}

    doc2_bson = (
        b"\x11\x00\x00\x00"          # document length
        b"\x0br\x00.*\x00imsux\x00"  # r: regex
        b"\x00")                     # document terminator

    self.assertEqual(doc2_bson, BSON.encode(doc2_with_re))
    self.assertEqual(doc2_bson, BSON.encode(doc2_with_bson_re))

    # Built-in re objects don't support ==. Compare pattern and flags.
    self.assertEqual(re2.pattern, BSON(doc2_bson).decode()['r'].pattern)
    self.assertEqual(re2.flags, BSON(doc2_bson).decode()['r'].flags)
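# A minimal standalone sketch of the round trip the test above exercises
# (assumes PyMongo >= 3.9 for the module-level bson.encode/decode): a
# bson.Regex can hold a pattern that is valid PCRE but not a valid Python
# regex, and try_compile() defers compilation until it is actually needed.
import re

from bson import encode, decode
from bson.regex import Regex

pattern = Regex(r'[\w-\.]')     # valid PCRE, invalid Python regex
raw = encode({'r': pattern})
decoded = decode(raw)['r']      # decoded as bson.Regex, not re.Pattern
assert decoded == pattern
try:
    decoded.try_compile()       # only now does Python try to compile it
except re.error as exc:
    print('not a Python regex:', exc)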
def test_bson_classes(self):
    _id = '5a918f9fa08bff9c7688d3e1'

    for a, b in [
            (Binary(b'foo'), Binary(b'foo')),
            (Code('foo'), Code('foo')),
            (Code('foo', {'x': 1}), Code('foo', {'x': 1})),
            (DBRef('coll', 1), DBRef('coll', 1)),
            (DBRef('coll', 1, 'db'), DBRef('coll', 1, 'db')),
            (Decimal128('1'), Decimal128('1')),
            (MaxKey(), MaxKey()),
            (MinKey(), MinKey()),
            (ObjectId(_id), ObjectId(_id)),
            (Regex('foo', 'i'), Regex('foo', 'i')),
            (Timestamp(1, 2), Timestamp(1, 2)),
    ]:
        # Basic case.
        self.assertTrue(
            Matcher(Command(y=b)).matches(Command(y=b)),
            "MockupDB %r doesn't equal itself" % (b,))

        # First Command argument is special, try comparing the second also.
        self.assertTrue(
            Matcher(Command('x', y=b)).matches(Command('x', y=b)),
            "MockupDB %r doesn't equal itself" % (b,))

        # In practice, users pass PyMongo classes in message specs.
        self.assertTrue(
            Matcher(Command(y=b)).matches(Command(y=a)),
            "PyMongo %r != MockupDB %r" % (a, b))

        self.assertTrue(
            Matcher(Command('x', y=b)).matches(Command('x', y=a)),
            "PyMongo %r != MockupDB %r" % (a, b))
def inner_match_fields(_fields, _matched_fields, _reference=None):
    """
    Get the list of matched fields inside the referenced fields.

    Args:
        _fields: list of referenced fields
        _matched_fields: accumulator for the fields matched so far
        _reference: reference object to which the reference field refers

    Returns:
        list of fields where the regex matched
    """
    # `possible_regex_list` (the text regexes from the pipeline builder)
    # is a free variable from the enclosing scope.
    for regex_ in possible_regex_list:
        try:
            runtime_regex = Regex(regex_, 'ims').try_compile()
        except Exception:
            runtime_regex = regex_
        for field in _fields:
            try:
                res = runtime_regex.findall(str(field.get('value')))
                if len(res) > 0:
                    inner_value = _reference if _reference else field
                    # Avoid duplicates in the result list.
                    if inner_value not in _matched_fields:
                        _matched_fields.append(inner_value)
                if field['type'] == 'ref':
                    inner_match_fields(field['reference']['summaries'],
                                       _matched_fields, field)
                if field['type'] == 'ref-section-field':
                    inner_match_fields(field['references']['fields'],
                                       _matched_fields, field)
            except Exception:
                continue
def test_exception_wrapping(self):
    # No matter what exception is raised while trying to decode BSON,
    # the final exception always matches InvalidBSON and the original
    # traceback is preserved.

    # Invalid Python regex, though valid PCRE.
    # Causes an error in re.compile().
    bad_doc = BSON.encode({'r': Regex(r'[\w-\.]')})

    try:
        decode_all(bad_doc)
    except InvalidBSON:
        exc_type, exc_value, exc_tb = sys.exc_info()
        # Original re error was captured and wrapped in InvalidBSON.
        self.assertEqual(exc_value.args[0], 'bad character range')

        # Traceback includes bson module's call into re module.
        for filename, lineno, fname, text in traceback.extract_tb(exc_tb):
            if filename.endswith('re.py') and fname == 'compile':
                # Traceback was correctly preserved.
                break
        else:
            self.fail('Traceback not captured')
    else:
        self.fail('InvalidBSON not raised')
def find_match_fields(result: R, possible_regex_list=None):
    """
    Get the list of matched fields inside the search result.

    Args:
        result: generic search result
        possible_regex_list: list of text regexes from the pipeline builder

    Returns:
        list of fields where a regex matched, or None
    """
    matched_fields = []
    fields = result.fields

    if not possible_regex_list:
        return None

    for regex_ in possible_regex_list:
        try:
            runtime_regex = Regex(regex_, 'imsx').try_compile()
        except Exception:
            runtime_regex = regex_
        for field in fields:
            try:
                res = runtime_regex.findall(str(field.get('value')))
                if len(res) > 0:
                    matched_fields.append(field)
            except Exception:
                continue
    if len(matched_fields) > 0:
        return matched_fields
    return None
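# Hedged usage sketch for find_match_fields above: FakeResult is a stand-in
# for the real search-result class (which is not shown here); it only needs
# a `fields` list of {'value': ...} dicts.
class FakeResult:
    fields = [{'value': 'Server-01'}, {'value': 'printer'}]

matches = find_match_fields(FakeResult(), possible_regex_list=['server'])
print(matches)  # [{'value': 'Server-01'}] -- the 'i' flag makes it case-insensitive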
async def get_flows_and_count_db(
        *, current_page: int, page_size: int, sorter: str = None,
        flow_name: str, language: str, updated_at: list[date],
        triggered_counts: list[int]) -> tuple[list[FlowSchemaDb], int]:
    if updated_at:
        updated_at_start, updated_at_end = updated_at
    db_key = [
        ("name", {"$ne": None}),
        ("name", Regex(f".*{escape(flow_name)}.*", "i") if flow_name else ...),
        ("triggered_count", {
            "$gte": triggered_counts[0],
            "$lte": triggered_counts[1]
        } if triggered_counts else ...),
        ("is_active", True),
        ("updated_at", {
            "$gte": make_timezone_aware(updated_at_start),
            "$lte": make_timezone_aware(updated_at_end)
        } if updated_at else ...),
    ]
    query = form_query(db_key)
    flows = await get_flows_db(current_page=current_page, page_size=page_size,
                               sorter=sorter, query=query)
    total = await get_flows_count_db(query=query)
    return flows, total
def test_regex_pickling(self):
    reg = Regex(".?")
    pickled_with_3 = (b'\x80\x04\x959\x00\x00\x00\x00\x00\x00\x00\x8c\n'
                      b'bson.regex\x94\x8c\x05Regex\x94\x93\x94)\x81\x94}'
                      b'\x94(\x8c\x07pattern\x94\x8c\x02.?\x94\x8c\x05flag'
                      b's\x94K\x00ub.')
    self.round_trip_pickle(reg, pickled_with_3)
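# What round_trip_pickle presumably asserts (the helper itself lives
# elsewhere in that test suite): a sketch showing that Regex survives a
# pickle round trip, since it defines equality on (pattern, flags).
import pickle

from bson.regex import Regex

reg = Regex(".?")
assert pickle.loads(pickle.dumps(reg)) == reg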
def query_nstart(self, value):
    # DOESN'T START WITH
    if isinstance(value, list):
        value = value[0]
    return MongoQuery(
        {self.field: {'$not': Regex('^' + value + '.*', 'i')}})
def query_nlike(self, value):
    # WILDCARD NOT CONTAINS
    if isinstance(value, list):
        value = value[0]
    return MongoQuery(
        {self.field: {'$not': Regex('.*' + value + '.*', 'i')}})
def query_nend(self, value):
    # DOESN'T END WITH
    if isinstance(value, list):
        value = value[0]
    # '.*' + value + '$' anchors at the end; the original '*.' prefix was
    # an invalid pattern ("nothing to repeat").
    return MongoQuery(
        {self.field: {'$not': Regex('.*' + value + '$', 'i')}})
async def get_portal_user(username: str):
    """
    Retrieve the matching portal user.
    :return: the first active user whose username matches, else None
    """
    # re.escape() guards against regex metacharacters in the username;
    # the ^...$ anchors make this a case-insensitive exact match.
    query = {"username": Regex(f"^{re.escape(username)}$", "i"),
             "is_active": True}
    async for user in portal_user_collection.find(query):
        return user
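# The same case-insensitive exact match can be done with a collation instead
# of an anchored regex (MongoDB >= 3.4); with a matching index this avoids
# the regex scan. A synchronous sketch with illustrative names -- the client,
# database, and collection here are assumptions, not from the original.
from pymongo import MongoClient
from pymongo.collation import Collation

users = MongoClient().portal.portal_user
user = users.find_one(
    {"username": "Alice", "is_active": True},
    collation=Collation(locale="en", strength=2),  # strength=2: ignore case
)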
def test_jsonify_Regex(self):
    regex = Regex("bb|[^b]{2}")
    json = {'a': 1, 'regex': regex}
    safe_json = {'a': 1, 'regex': {'$regex': "bb|[^b]{2}", "$options": ""}}
    jsonified_bson = jsonify(json).response
    jsonified = flask_jsonify(safe_json).response
    assert jsonified_bson == jsonified
def extract_condition(val):
    # `key` is a free variable from the enclosing scope.
    if isinstance(val, dict):
        for code, sval in val.items():
            if sval[0] == '/':
                # '/pattern/' literals become BSON regexes.
                val[code] = Regex(sval[1:-1])
        return Condition(key, val)
    elif val == 1:
        return Condition(key, modifier='exists')
    elif val == 0:
        return Condition(key, modifier='not_exists')
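# Usage sketch for extract_condition above: since `key` comes from the
# enclosing scope, it is bound explicitly here for illustration; the tag
# value '269' is hypothetical.
key = '269'
cond = extract_condition({'a': '/^1975/'})
# -> Condition('269', {'a': Regex('^1975')}), the same shape as the dlx
#    query conditions used in the examples further down.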
def test_regex_comparison(self):
    re1 = Regex('a')
    re2 = Regex('b')
    self.assertNotEqual(re1, re2)

    re1 = Regex('a', re.I)
    re2 = Regex('a', re.M)
    self.assertNotEqual(re1, re2)

    re1 = Regex('a', re.I)
    re2 = Regex('a', re.I)
    self.assertEqual(re1, re2)
def remove_attachment_id_from_flow(flow_collection: Collection, url: str) -> None:
    """
    Remove attachment_id from old collection and update url with new bucket
    """
    filename = os.path.split(url)[-1]
    query = {"flow.data.url": Regex(f".*{re.escape(filename)}$", "i")}
    docs = flow_collection.find(query)
    for doc in docs:
        flows = doc['flow']
        for flow in flows:
            if flow['type'] in ['image', 'video'] \
                    and flow['data']['url'].endswith(filename):
                flow['data']['url'] = url
                flow['data'].pop('attachment_id', None)
        flow_collection.replace_one({"_id": doc['_id']}, doc)
def find(self, page_size, page_index, keyword=None):
    if keyword is not None:
        reg = Regex(r'%s' % keyword)
        query = {'$or': [{'title': reg}, {'content': reg}]}
        cursor = self.db.worklog.find(query)\
            .sort("creationdate", -1)\
            .skip((page_index - 1) * page_size)\
            .limit(page_size)
        # Cursor.count() was removed in PyMongo 4; count the query directly.
        total = self.db.worklog.count_documents(query)
    else:
        cursor = self.db.worklog.find()\
            .sort("creationdate", -1)\
            .skip((page_index - 1) * page_size)\
            .limit(page_size)
        total = self.db.worklog.estimated_document_count()
    return cursor, total
async def update_message_db(message_item: UpdateMessageResponse,
                            current_user: CurrentUserSchema,
                            language: str = 'EN') -> str:
    query = {"_id": ObjectId(message_item.id)}
    result1 = result2 = result3 = 0

    # Add the selected answer to the message if it's not the same as the
    # original response/graded response.
    message_from_db = await message_collection.find_one(query)
    graded_response = message_from_db.get('adminportal', {}).get('answer')
    original_response = message_from_db.get('chatbot', {}).get('qnid')
    response = graded_response or original_response
    if not graded_response and response == message_item.new_response:
        return 'No questions updated'

    updated_info_query = {
        "updated_at": get_local_datetime_now(),
        "updated_by": ObjectId(current_user.userId),
    }

    # Add the graded response to the message.
    set_message_query = updated_info_query | {
        "adminportal.graded": True,
        "adminportal.answer": ObjectId(message_item.new_response)
    }
    result1 = await message_collection.update_one(query,
                                                  {'$set': set_message_query})

    # Delete the variation from the main question and add it to the new question.
    query = {
        "_id": ObjectId(response),
        "alternate_questions.text": Regex(f"^{escape(message_item.text)}$", "i"),
        "is_active": True
    }
    if question_db := await question_collection.find_one(query):
        # Remove the variation if a match is found.
        for idx, v in enumerate(question_db['alternate_questions']):
            if v['text'].lower() == message_item.text.lower():
                question_db['alternate_questions'].pop(idx)
                question_db |= updated_info_query
                result2 = await question_collection.replace_one(
                    {"_id": question_db['_id']}, question_db)
                break
def references(self, object_: CmdbObject, filter: dict, limit: int, skip: int,
               sort: str, order: int, user: UserModel = None,
               permission: AccessControlPermission = None, *args, **kwargs) \
        -> IterationResult[CmdbObject]:
    query = []
    if isinstance(filter, dict):
        query.append(filter)
    elif isinstance(filter, list):
        query += filter

    query.append(Builder.lookup_(_from='framework.types', _local='type_id',
                                 _foreign='public_id', _as='type'))
    query.append(Builder.unwind_({'path': '$type'}))

    field_ref_query = {
        'type.fields.type': 'ref',
        '$or': [
            {'type.fields.ref_types': Regex(f'.*{object_.type_id}.*', 'i')},
            {'type.fields.ref_types': object_.type_id},
        ]
    }
    section_ref_query = {
        'type.render_meta.sections.type': 'ref-section',
        'type.render_meta.sections.reference.type_id': object_.type_id
    }
    query.append(
        Builder.match_(Builder.or_([field_ref_query, section_ref_query])))
    query.append(Builder.match_({'fields.value': object_.public_id}))

    return self.iterate(filter=query, limit=limit, skip=skip, sort=sort,
                        order=order, user=user, permission=permission)
def show_symbols(path):
    path = re.escape(path)
    query = QueryDocument(
        Condition(
            tag='191',
            subfields={'a': Regex('^' + path)},
        ),
    )
    print(f" the query is -- {query.to_json()}")
    bibset = BibSet.from_query(query, projection={'191': True}, skip=0, limit=0)
    a_res_en = []
    for bib in bibset.records:
        a_res_en.append(bib.get_value('191', 'a'))
    return_data = sorted([quote(doc) for doc in a_res_en],
                         key=lambda x: int(''.join(c for c in x if c.isdigit())))
    return jsonify(return_data)
def references(self, object_: CmdbObject, filter: dict, limit: int, skip: int,
               sort: str, order: int, user: UserModel = None,
               permission: AccessControlPermission = None, *args, **kwargs) \
        -> IterationResult[CmdbObject]:
    query = []
    if isinstance(filter, dict):
        query.append(filter)
    elif isinstance(filter, list):
        query += filter

    query.append({
        '$lookup': {
            'from': 'framework.types',
            'localField': 'type_id',
            'foreignField': 'public_id',
            'as': 'type'
        }
    })
    query.append({'$unwind': {'path': '$type'}})
    query.append({
        '$match': {
            'type.fields.type': 'ref',
            '$or': [
                {'type.fields.ref_types': Regex(f'.*{object_.type_id}.*', 'i')},
                {'type.fields.ref_types': object_.type_id},
            ]
        }
    })
    query.append({'$match': {'fields.value': object_.public_id}})

    return self.iterate(filter=query, limit=limit, skip=skip, sort=sort,
                        order=order, user=user, permission=permission)
def test_regex_hash(self):
    self.assertRaises(TypeError, hash, Regex('hello'))
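# bson.Regex is unhashable (it defines equality but no hash), so it can't be
# a dict or set key directly; a (pattern, flags) tuple is a simple stand-in.
# A small workaround sketch, not part of the test suite above.
from bson.regex import Regex

r = Regex('hello', 2)
seen = {(r.pattern, r.flags): 'first'}
r2 = Regex('hello', 2)
print((r2.pattern, r2.flags) in seen)  # True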
async def get_conversations_and_count_db(*, current_page: int, page_size: int,
                                         tags: list[str] = None,
                                         search_query: str = ''):
    conversations = []
    db_key = [
        ("$addFields", {
            "fullname": {"$concat": ["$first_name", " ", "$last_name"]}
        }),
        ("$match", {"tags": {"$all": tags}} if tags else ...),
        ("$match", {
            "fullname": Regex(f".*{escape(search_query)}.*", "i")
        } if search_query else ...),
    ]
    pipeline = form_pipeline(db_key)
    total = await bot_user_pipeline_count(pipeline=pipeline[:])
    extra_stages = [
        {"$sort": SON([("last_active.received_at", -1)])},
        {"$skip": (current_page - 1) * page_size},
        {"$limit": page_size},
        {"$lookup": {
            "from": "message",
            "localField": "last_active.received_message_id",
            "foreignField": "_id",
            "as": "last_message"
        }},
        {"$unwind": {
            "path": "$last_message",
            "preserveNullAndEmptyArrays": False
        }},
        # {"$lookup": {
        #     "from": "message",
        #     "localField": "message.chatbot.convo_id",
        #     "foreignField": "chatbot.convo_id",
        #     "as": "conversations"
        # }},
        # {"$addFields": {
        #     "last_message": {"$last": "$conversations"}
        # }},
        {"$project": {
            "_id": 1, "fullname": 1, "first_name": 1, "last_name": 1,
            "email": 1, "gender": 1, "profile_pic_url": 1, "last_active": 1,
            "created_at": 1, "chatbot": 1, "tags": 1, "platforms": 1,
            "last_message": 1
        }}
    ]
    pipeline.extend(extra_stages)
    cursor = bot_user_collection.aggregate(pipeline)
    async for conversation in cursor:
        entry = ConversationBotUserSchema(**bot_user_helper(conversation))
        if entry.last_message:
            entry.last_message = format_message_to_display(entry.last_message)
        conversations.append(entry)
    return conversations, total
def query_end(self, value):
    # ENDS WITH
    if isinstance(value, list):
        value = value[0]
    # '.*' + value + '$' anchors at the end; the original '*.' prefix was
    # an invalid pattern ("nothing to repeat").
    return MongoQuery({self.field: Regex('.*' + value + '$', 'i')})
def query_start(self, value):
    # STARTS WITH
    if isinstance(value, list):
        value = value[0]
    return MongoQuery({self.field: Regex('^' + value + '.*', 'i')})
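# A quick standalone check (plain `re`, no MongoDB needed) that the
# corrected ends-with pattern used by query_end/query_nend really anchors
# at the end, and that the original '*.' prefix would not even compile.
import re

assert re.search('.*abc$', 'xxabc', re.I)
assert not re.search('.*abc$', 'abcxx', re.I)
try:
    re.compile('*.abc$')
except re.error as exc:
    print('old pattern was invalid:', exc)  # nothing to repeat at position 0
# Note: these builders interpolate raw values into the pattern, so callers
# should re.escape() user input unless wildcard syntax is intended.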
def show_txt(path):
    '''Display the text of the document.'''
    return_data = ""
    path = re.escape(path)
    print(f" this is compiled path -- {'^' + str(path) + '$'}")
    doc_list = list(
        txts_coll.find({"doc_sym": {"$regex": "^" + str(path) + "$"}}))
    if len(doc_list) == 0 and path != 'favicon.ico':
        print(f"no exact DS {str(path)} - generating one")
        # Extract the text from the DB: build a list of tuples
        # (stripped doc_sym, url to the pdf in s3).
        query = QueryDocument(
            Condition(tag='191', subfields={'a': Regex('^' + path + '$')}))
        print(f" the imp query is -- {query.to_json()}")
        bibset = BibSet.from_query(query, skip=0, limit=3)
        a_res_en = []
        if bibset.count == 1:
            for bib in bibset.records:
                a_res_en.append(
                    (bib.get_value('191', 'a'),
                     'http://' + ''.join(bib.files('EN'))))
            print(a_res_en)
            for url in a_res_en:
                # url is a tuple; url[0] is a DS, e.g. ARES721;
                # url[1] is an s3 link to the pdf.
                if len(url[1]) > 10:
                    print(f" - - the {url[0]} is {url[1]} - -")
                    pdf = PDFExtract(url[1])
                    parsed = parser.from_buffer(pdf.get_txt_from_url(url[1]))
                    txt = Txt(bib.get_value('191', 'a'))
                    print(txt.set_txt(parsed["content"]))
                    txt.title = bib.get_value('245', 'a')
                    # Load the text into txts.
                    if txt.txt is not None:
                        query = {"doc_sym": txt.symbol}
                        txts_coll.replace_one(query, txt.to_bson(), upsert=True)
        doc_list = list(
            txts_coll.find({"doc_sym": {"$regex": "^" + str(path) + "$"}}))
    if len(doc_list) == 1:
        print("-- it's a hit- 1")
        if doc_list[0]['doc_sym'][0] != 'S':
            return_data = doc_list[0]['raw_txt']
        else:
            # For SC docs - temporary measure.
            doc_list[0].pop('_id')
            return_data = doc_list[0]
    elif len(doc_list) > 1:
        print("-- it's a hit- many")
        return_data = sorted(
            [doc['doc_sym'] for doc in doc_list],
            key=lambda x: int(''.join(c for c in x if c.isdigit())))
    if return_data == "":
        return jsonify('text with document symbol:%s was not found' % path)
    return jsonify(return_data)
def get_emitente(self):
    self.set_query({"_t": Regex(".*Emitente.*", "i")})
    return self.return_one()
    TypeTestCase(Int64, "long", Int64(13)),
    TypeTestCase(str, "string", "foo"),
    TypeTestCase(float, "double", 3.14),
    TypeTestCase(Decimal, "decimal", Decimal("3.14159265359")),
    # TODO: split tests for odmantic type inference
    TypeTestCase(Decimal, "decimal", "3.14159265359"),
    TypeTestCase(Decimal128, "decimal", Decimal128(Decimal("3.14159265359"))),
    TypeTestCase(Dict, "object", {"foo": "bar", "fizz": {"foo": "bar"}}),
    TypeTestCase(bool, "bool", False),
    TypeTestCase(Pattern, "regex", re.compile(r"^.*$")),
    TypeTestCase(Pattern, "regex", re.compile(r"^.*$", flags=re.IGNORECASE)),
    TypeTestCase(Pattern, "regex",
                 re.compile(r"^.*$", flags=re.IGNORECASE | re.MULTILINE)),
    TypeTestCase(Regex, "regex", Regex(r"^.*$", flags=32)),
    TypeTestCase(ObjectId, "objectId", ObjectId()),
    TypeTestCase(bytes, "binData", b"\xf0\xf1\xf2"),
    TypeTestCase(Binary, "binData", Binary(b"\xf0\xf1\xf2")),
    TypeTestCase(datetime, "date", sample_datetime),
    TypeTestCase(List, "array", ["one"]),
    # Compound types
    TypeTestCase(Tuple[str, ...], "array", ("one",)),  # type: ignore
    TypeTestCase(List[ObjectId], "array", [ObjectId() for _ in range(5)]),
    TypeTestCase(
        Union[Tuple[ObjectId, ...], None],  # type: ignore
        "array",
        tuple(ObjectId() for _ in range(5)),
    ),
]
from bson import Regex
from dlx import DB
from dlx.marc import BibSet, QueryDocument, Condition
from config import Config

DB.connect(Config.connect_string)

query = QueryDocument(Condition(tag='191', modifier='exists'),
                      Condition(tag='269', subfields={'a': Regex('^1975')}))
print(query.to_json())

bibset = BibSet.from_query(query, projection={'191': True}, skip=0, limit=0)
print('There are {} results'.format(bibset.count))

bibset.cache()
for bib in bibset.records:
    print('id: {}, symbol: {}'.format(bib.id, bib.get_value('191', 'a')))
print(bibset.to_xml())
database = Client(config.PROD).connect()

# Year to search by in the database.
year = 2018

collection = database[f"claims_{year}"]
result_file = open(f'IPGU01001_{year}.csv', "w+")

pipeline = [
    {"$match": {
        "service": {"$exists": True},
        "service.srguServicePassportId": {"$not": Regex(".*_444$", "i")},
        "senderCode": "IPGU01001"
    }},
    # {"$limit": 100.0},
    {"$group": {
        "_id": "$service.srguServicePassportId",
        "count": {"$sum": 1.0}
    }},
]
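# The snippet above opens result_file but is cut off before the pipeline
# runs; one plausible ending is sketched here. The cursor options and CSV
# layout are assumptions, not from the original.
import csv

cursor = collection.aggregate(pipeline, allowDiskUse=True)
writer = csv.writer(result_file)
writer.writerow(["srguServicePassportId", "count"])
for row in cursor:
    writer.writerow([row["_id"], int(row["count"])])
result_file.close()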