def player():
    """Poll the Mongo queue forever, downloading and playing tracks in date order.

    Writes the current PID to PID_FN so an external controller can stop the
    process.  Relies on module-level configuration: PID_FN, FILTER, WAIT_SEC,
    MAX_ITERATION_NUM, PLAY_AUDIO_COMMAND and the myexec() helper.
    """
    import os

    logging.basicConfig(level=logging.INFO)

    # Record our PID for external process management.
    with open(PID_FN, "w") as pid_file:
        pid_file.write(str(getpid()))

    coll = MongoClient()["for-music-player"].queue

    i = 0
    while True:
        i += 1
        if not coll.count_documents(filter=FILTER):
            print(f"> queue empty. wait {WAIT_SEC} sec...")
            sleep(WAIT_SEC)
        elif MAX_ITERATION_NUM >= 0 and i >= MAX_ITERATION_NUM:
            # Bounded run requested; stop after the configured iteration count.
            exit()
        else:
            # Oldest pending entry first.
            obj = coll.find(filter=FILTER, sort=[("date", 1)])[0]

            # Renamed from `f` (which shadowed the PID file handle above).
            fd, fn = mkstemp(suffix=".mp3")
            try:
                coll.update_one({"_id": obj["_id"]},
                                {"$set": {"start": datetime.now()}})
                # NOTE(review): obj['path'] is interpolated into a shell
                # command -- safe only if queue entries are trusted.
                myexec(f"wget \"{obj['path']}\" -O \"{fn}\"")
                myexec(f"{PLAY_AUDIO_COMMAND} \"{fn}\"")
            finally:
                # Close the descriptor and drop the temp file even when the
                # download or playback fails (the original leaked both).
                close(fd)
                try:
                    os.remove(fn)
                except OSError:
                    pass

            coll.update_one({"_id": obj["_id"]},
                            {"$set": {"played": True, "end": datetime.now()}})
def main():
    """Print, one per line, how many abstracts carry each tag of interest."""
    collection = MongoClient(DATABASE_URL).abstracts.all
    for tag in ('gabby', 'dataset1', 'dataset2', 'matthew', 'food science'):
        count = collection.count_documents({'tags': tag})
        print(f'{tag}: {count}')
def subscribe(self, msg, args):
    """
    Subscribe to repository notifications.

    It takes only one mandatory argument: the repository URL
    (must be public HTTP or HTTPS).
    """
    # Validate arguments: only the first in URL valid format
    url = self.validURL(args)
    if url:
        collection = MongoClient(host='mongodb').chat0ps.subscriptions
        # First check: if repository exists
        repository = {"repository": url}
        if collection.count_documents(repository) >= 1:
            # Second check: if user is subscribed to this repository
            subscription = {
                "repository": url,
                "subscribers": msg.frm.person
            }
            subscriptions = collection.count_documents(subscription)
            if subscriptions >= 1:
                # No need to update collection.
                yield "Repository already subscribed."
            else:
                # Add user to subscribers list.  update_one replaces the
                # Collection.update() call that modern PyMongo removed.
                collection.update_one(
                    repository,
                    {"$push": {"subscribers": msg.frm.person}})
                yield "Done. You may now set repository webhook to: http://35.198.17.35/publish"
        else:
            # If the repository doesn't exist, it's time to create it and
            # subscribe the user to it in a single command.
            # Note a little difference here: "subscribers" must be a list.
            subscription = {
                "repository": url,
                "subscribers": [msg.frm.person]
            }
            collection.insert_one(subscription)
            yield "Done."
    else:
        yield "Please inform a valid URL."
def update_service_list(services_list):
    """Sync every service into the kerberus_services collection, then dump
    the (mutated) list to kerberus.json.

    Entries carrying a 'mongo' id already present in the collection are
    updated in place; all others are inserted and gain a 'mongo' field
    holding their new ObjectId as a string ('_id' is stripped so the list
    stays JSON-serializable).
    """
    collection = MongoClient()['iacon']['kerberus_services']

    for entry in services_list:
        known = False
        if 'mongo' in entry:
            oid = ObjectId(entry['mongo'])
            known = collection.count_documents({"_id": oid}, limit=1) > 0

        if known:
            collection.update_one({"_id": oid}, {"$set": entry})
        else:
            collection.insert_one(entry)
            entry['mongo'] = str(entry['_id'])
            del entry['_id']

    with codecs.open("kerberus.json", "w+", "utf8") as out:
        out.write(json.dumps(services_list, indent=4, ensure_ascii=False))
def subscriptions(self, msg, args):
    """
    List all repository subscriptions.
    """
    collection = MongoClient(host='mongodb').chat0ps.subscriptions
    query = {"subscribers": msg.frm.person}

    # Yield one repository URL per subscription, or an apology if none.
    if collection.count_documents(query) >= 1:
        for doc in collection.find(query):
            yield doc["repository"]
    else:
        yield "Sorry, no subscribed repository."
class CardManager(metaclass=Singleton):
    """Singleton store for spaced-repetition cards backed by MongoDB.

    Each card points at a question and an answer Telegram message and carries
    a repetition 'level' plus a 'deadline' timestamp derived from
    LEVEL_DEADLINES.
    """

    def __init__(self):
        username = getenv('MONGO_USERNAME')
        password = getenv('MONGO_PASSWORD')
        self.cards = MongoClient(
            f'mongodb://{username}:{password}@mongo:27017/').db.cards

    def add(self, chat_id: int, question, answer) -> None:
        """Insert a new card at level 0 with an immediate deadline."""
        card = {
            'chat_id': chat_id,
            'question': {
                'from_chat_id': question['chat']['id'],
                'message_id': question['message_id']
            },
            'answer': {
                'from_chat_id': answer['chat']['id'],
                'message_id': answer['message_id']
            },
            'level': 0,
            'deadline': datetime.now().timestamp()
        }
        self.cards.insert_one(card)

    def count(self):
        """Return the total number of stored cards."""
        return self.cards.count_documents({})

    def update_level(self, card, value):
        """Shift the card's level by `value` (clamped to the valid range)
        and push its deadline accordingly, persisting the change."""
        card['level'] = max(
            0, min(card['level'] + value, len(LEVEL_DEADLINES) - 1))
        card['deadline'] = (datetime.now() +
                            LEVEL_DEADLINES[card['level']]).timestamp()
        # MongoDB rejects any update that touches the immutable '_id' path,
        # even when the value is unchanged -- the original $set of the whole
        # card therefore failed.  Exclude '_id' from the update document.
        changes = {k: v for k, v in card.items() if k != '_id'}
        self.cards.update_one({'_id': card['_id']}, {'$set': changes},
                              upsert=False)

    def current_top_for(self, chat_id: int) -> Dict:
        """Return the card with the earliest deadline for `chat_id`."""
        return self.cards.find({
            'chat_id': chat_id
        }).sort(key_or_list='deadline', direction=1)[0]
def unsubscribe(self, msg, args):
    """
    Unsubscribe to repository notifications

    It takes only one mandatory argument: the repository URL.
    """
    url = self.validURL(args)
    if url:
        collection = MongoClient(host='mongodb').chat0ps.subscriptions
        # Check: if subscription exists
        repository = {"repository": url}
        document = {"repository": url, "subscribers": msg.frm.person}
        subscriptions = collection.count_documents(document)
        if subscriptions >= 1:
            # $pull removes the matching value from the array.  The original
            # used $pop, which only accepts 1/-1 (drop last/first element)
            # and errors out when given a person value; update_one replaces
            # the Collection.update() call removed from modern PyMongo.
            collection.update_one(
                repository,
                {"$pull": {"subscribers": msg.frm.person}})
            yield "Done"
        else:
            yield "Sorry, you're not subscribed to this repository."
    else:
        yield "Please inform a valid URL."
class CrossrefAsyncCollector(object):
    """Asynchronously collects Crossref metadata for cited references,
    persisting results either to MongoDB or to a JSON-lines file."""

    # Class-level side effect: configures root logging when the class is defined.
    logging.basicConfig(level=logging.INFO)

    def __init__(self, email: None, mongo_uri_std_cits=None):
        # email: contact e-mail sent along with Crossref requests.
        # mongo_uri_std_cits: when given, persist_mode is 'mongo' and results
        # go to the parsed collection (MONGO_STDCITS_COLLECTION by default);
        # otherwise persist_mode is 'json' and results are appended to a
        # timestamped file under DIR_DATA.
        self.email = email

        if mongo_uri_std_cits:
            try:
                self.persist_mode = 'mongo'
                mongo_col = uri_parser.parse_uri(mongo_uri_std_cits).get(
                    'collection')
                if not mongo_col:
                    mongo_col = MONGO_STDCITS_COLLECTION
                self.standardizer = MongoClient(
                    mongo_uri_std_cits).get_database().get_collection(
                        mongo_col)

                total_docs = self.standardizer.count_documents({})
                logging.info(
                    'There are {0} documents in the collection {1}'.format(
                        total_docs, mongo_col))
            except ConnectionError as e:
                logging.error('ConnectionError %s' % mongo_uri_std_cits)
                logging.error(e)
        else:
            self.persist_mode = 'json'
            file_name_results = 'crossref-results-' + str(
                time.time()) + '.json'
            self.path_results = os.path.join(DIR_DATA, file_name_results)

    def extract_attrs(self, article: Article):
        """
        Extract the attributes of all cited references of a document.

        :param article: document whose cited references' attributes will be extracted
        :return: dict of citation ids to their respective attributes
        """
        cit_id_to_attrs = {}

        if article.citations:
            for cit in article.citations:
                if cit.publication_type == 'article':
                    cit_id = self.mount_id(cit, article.collection_acronym)
                    cit_attrs = {}
                    if self.persist_mode == 'json':
                        cit_attrs = self._extract_cit_attrs(cit)
                    elif self.persist_mode == 'mongo':
                        # Skip citations whose Crossref metadata is already stored.
                        cit_data = self.standardizer.find_one({'_id': cit_id})
                        if not cit_data or not cit_data.get('crossref'):
                            cit_attrs = self._extract_cit_attrs(cit)
                    if cit_attrs:
                        cit_id_to_attrs[cit_id] = cit_attrs

        return cit_id_to_attrs

    def _extract_cit_attrs(self, cit: Citation):
        """
        Extract from a cited reference the attributes needed to request
        CrossRef metadata.

        :param cit: cited reference
        :return: dict of attributes for querying the CrossRef service
        """
        # A valid DOI short-circuits: it is the strongest possible key.
        if cit.doi:
            valid_doi = preprocess_doi(cit.doi)
            if valid_doi:
                return {'doi': valid_doi}

        attrs = {}

        if cit.first_author:
            first_author_surname = cit.first_author.get('surname', '')
            cleaned_author_surname = preprocess_author_name(
                first_author_surname)
            if cleaned_author_surname:
                attrs.update({'aulast': cleaned_author_surname})

        journal_title = cit.source
        if journal_title:
            cleaned_journal_title = preprocess_journal_title(journal_title)
            if cleaned_journal_title:
                attrs.update({'title': cleaned_journal_title})

        publication_date = html.unescape(
            cit.publication_date) if cit.publication_date else None
        if publication_date and len(publication_date) >= 4:
            publication_year = publication_date[:4]
            if publication_year.isdigit():
                # NOTE(review): the OpenURL parameter is named 'data' here;
                # presumably intended as the year parameter -- confirm
                # against the Crossref OpenURL query documentation.
                attrs.update({'data': publication_year})

        volume = html.unescape(cit.volume) if cit.volume else None
        if volume:
            attrs.update({'volume': volume})

        issue = html.unescape(cit.issue) if cit.issue else None
        if issue:
            attrs.update({'issue': issue})

        first_page = html.unescape(cit.first_page) if cit.first_page else None
        if first_page:
            attrs.update({'spage': first_page})

        # Returns None implicitly when no attribute could be extracted.
        if attrs:
            return attrs

    def parse_crossref_openurl_result(self, text):
        """
        Convert response.text into JSON with metadata obtained from the
        OPENURL endpoint.

        :param text: request response in text format (XML)
        :return: JSON with metadata obtained from the CrossRef service
        """
        try:
            raw = xmltodict.parse(text)

            for v in raw.get('doi_records', {}).values():
                metadata = v.get('crossref')
                if metadata and 'error' not in metadata.keys():
                    owner = v.get('@owner')
                    if owner:
                        metadata.update({'owner': owner})

                    timestamp = v.get('@timestamp')
                    if timestamp:
                        metadata.update({'timestamp': timestamp})

                    journal_article = metadata.get('journal', {}).get(
                        'journal_article', {})
                    # Drop the (large) reference list before persisting.
                    if 'citation_list' in journal_article:
                        journal_article.__delitem__('citation_list')

                    # Only the first valid record is returned.
                    return metadata
        except ExpatError as e:
            logging.warning("ExpatError {0}".format(text))
            logging.warning(e)

    def parse_crossref_works_result(self, raw_metadata):
        """
        Clean a metadata dict obtained from the WORKS endpoint; removes the
        references field.

        :param raw_metadata: request response as a dict
        :return: JSON with metadata obtained from the Crossref service
        """
        raw_status = raw_metadata.get('status', '')
        if raw_status == 'ok':
            metadata = raw_metadata.get('message')
            if metadata:
                if 'reference' in metadata:
                    metadata.__delitem__('reference')
                return metadata

    def mount_id(self, cit: Citation, collection: str):
        """
        Build the identifier of a cited reference.

        :param cit: cited reference
        :param collection: collection in which the reference was cited
        :return: citation identifier code
        """
        cit_id = cit.data['v880'][0]['_']
        return '{0}-{1}'.format(cit_id, collection)

    def save_crossref_metadata(self, id_to_metadata: dict):
        """
        Persist the cited reference's metadata.

        :param id_to_metadata: dict with the cited reference id and its
            respective Crossref metadata
        """
        if self.persist_mode == 'json':
            # JSON-lines: one record appended per line.
            with open(self.path_results, 'a') as f:
                json.dump(id_to_metadata, f)
                f.write('\n')
        elif self.persist_mode == 'mongo':
            self.standardizer.update_one(
                filter={'_id': id_to_metadata['_id']},
                update={
                    '$set': {
                        'crossref': id_to_metadata['crossref'],
                        'update-date': datetime.now().strftime('%Y-%m-%d')
                    }
                },
                upsert=True)

    async def run(self, citations_attrs: dict):
        # Bound the number of concurrent Crossref requests.
        sem = asyncio.Semaphore(CROSSREF_SEMAPHORE_LIMIT)
        tasks = []

        # NOTE(review): the header key 'mailto:' (with a trailing colon)
        # looks odd -- verify against Crossref polite-pool conventions.
        async with ClientSession(headers={'mailto:': self.email}) as session:
            for cit_id, attrs in citations_attrs.items():
                if 'doi' in attrs:
                    # DOI available: query the WORKS endpoint directly.
                    url = CROSSREF_URL_WORKS.format(attrs['doi'])
                    mode = 'doi'
                else:
                    # No DOI: build an OpenURL query from the other attributes.
                    url = CROSSREF_URL_OPENURL
                    for k, v in attrs.items():
                        if k != 'doi':
                            url += '&' + k + '=' + v
                    url += '&pid=' + self.email
                    url += '&format=unixref'
                    url += '&multihit=false'
                    mode = 'attrs'

                task = asyncio.ensure_future(
                    self.bound_fetch(cit_id, url, sem, session, mode))
                tasks.append(task)

            responses = asyncio.gather(*tasks)
            await responses

    async def bound_fetch(self, cit_id, url, semaphore, session, mode):
        # Serialize fetches through the shared semaphore.
        async with semaphore:
            await self.fetch(cit_id, url, session, mode)

    async def fetch(self, cit_id, url, session, mode):
        # Downloads and parses one citation's metadata; all failures are
        # logged and swallowed so one bad citation never aborts the batch.
        try:
            async with session.get(url) as response:
                try:
                    logging.info('Collecting metadata for %s' % cit_id)

                    if mode == 'doi':
                        raw_metadata = await response.json(content_type=None)
                        if raw_metadata:
                            metadata = self.parse_crossref_works_result(
                                raw_metadata)
                    else:
                        raw_metadata = await response.text()
                        if raw_metadata:
                            metadata = self.parse_crossref_openurl_result(
                                raw_metadata)

                    # NOTE(review): if raw_metadata is falsy, `metadata` is
                    # never bound and the next line raises NameError (caught
                    # by no handler here) -- confirm whether empty responses
                    # can occur.
                    if metadata:
                        id_to_metadata = {'_id': cit_id, 'crossref': metadata}
                        self.save_crossref_metadata(id_to_metadata)
                except JSONDecodeError as e:
                    logging.warning('JSONDecodeError: %s' % cit_id)
                    logging.warning(e)
                except TimeoutError as e:
                    logging.warning('TimeoutError [INNER]: %s' % cit_id)
                    logging.warning(e)
                except ContentTypeError as e:
                    logging.warning('ContentTypeError: %s' % cit_id)
                    logging.warning(e)
        except ServerDisconnectedError as e:
            logging.warning('ServerDisconnectedError: %s' % cit_id)
            logging.warning(e)
        except TimeoutError as e:
            logging.warning('TimeoutError [OUTER]: %s' % cit_id)
            logging.warning(e)
        except ClientConnectorError as e:
            logging.warning('ClientConectorError: %s' % cit_id)
            logging.warning(e)
class Standardizer:
    """Normalizes cited journal references against a pickled lookup database,
    matching titles exactly or fuzzily and disambiguating candidates with
    ISSN-YEAR-VOLUME validation keys."""

    # Class-level side effect: configures root logging when the class is defined.
    logging.basicConfig(level=logging.INFO)

    def __init__(self,
                 path_db,
                 use_exact=False,
                 use_fuzzy=False,
                 mongo_uri_std_cits=None):
        # path_db: pickled correction/validation database loaded into self.db.
        # use_exact / use_fuzzy: which matching strategies standardize() applies.
        # mongo_uri_std_cits: when given, results persist to MongoDB;
        # otherwise to a timestamped JSON-lines file under DIR_DATA.
        self.use_exact = use_exact
        self.use_fuzzy = use_fuzzy

        if mongo_uri_std_cits:
            try:
                self.persist_mode = 'mongo'
                mongo_col = uri_parser.parse_uri(mongo_uri_std_cits).get(
                    'collection')
                if not mongo_col:
                    mongo_col = MONGO_STDCITS_COLLECTION
                self.standardizer = MongoClient(
                    mongo_uri_std_cits).get_database().get_collection(
                        mongo_col)

                total_docs = self.standardizer.count_documents({})
                logging.info(
                    'There are {0} documents in the collection {1}'.format(
                        total_docs, mongo_col))
            except ConnectionError as e:
                logging.error('ConnectionError %s' % mongo_uri_std_cits)
                logging.error(e)
        else:
            self.persist_mode = 'json'
            file_name_results = 'std-results-' + str(time.time()) + '.json'
            self.path_results = os.path.join(DIR_DATA, file_name_results)

        if path_db:
            logging.info('Loading %s' % path_db)
            self.db = self.load_database(path_db)

    def add_hifen_issn(self, issn: str):
        """
        Insert the hyphen into an ISSN.

        :param issn: ISSN without hyphen
        :return: ISSN with hyphen (None passes through unchanged)
        """
        if issn:
            return issn[:4] + '-' + issn[4:]

    def load_database(self, path_db: str):
        """
        Load the binary file of the correction and validation bases into memory.

        :param path_db: path of the binary file
        :return: loaded base as a dict (None if the file does not exist)
        """
        try:
            with open(path_db, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            logging.error('File {0} does not exist'.format(path_db))

    def extract_issnl_from_valid_match(self, valid_match: str):
        """
        Extract the ISSN-L from an ISSN-YEAR-VOLUME key.  If the ISSN is not
        in the issn-to-issnl dict, the ISSN itself is taken as the ISSN-L.

        :param valid_match: validated key in ISSN-YEAR-VOLUME format
        :return: ISSN-L
        """
        issn, year, volume = valid_match.split('-')
        issnl = self.db['issn-to-issnl'].get(issn, '')
        if not issnl:
            issnl = issn
        return issnl

    def extract_issn_year_volume_keys(self, cit: Citation, issns: set):
        """
        Build ISSN-YEAR-VOLUME keys for a cited reference and a list of ISSNs.

        :param cit: cited reference
        :param issns: set of possible ISSNs
        :return: (set of ISSN-YEAR-VOLUME keys, volume-mode flag)
        """
        keys = set()

        cit_year = cit.publication_date
        if cit_year:
            if len(cit_year) > 4:
                cit_year = cit_year[:4]
            if len(cit_year) == 4 and cit_year.isdigit():
                cit_vol = cit.volume
                if cit_vol and cit_vol.isdigit():
                    # The citation carries a usable volume of its own.
                    for i in issns:
                        keys.add('-'.join([i, cit_year, cit_vol]))
                    return keys, VOLUME_IS_ORIGINAL
                else:
                    # No usable volume: infer one per ISSN via regression.
                    for i in issns:
                        cit_vol_inferred = self.infer_volume(i, cit_year)
                        if cit_vol_inferred:
                            keys.add('-'.join(
                                [i, cit_year, cit_vol_inferred]))
                    return keys, VOLUME_IS_INFERRED
        return keys, VOLUME_NOT_USED

    def get_issns(self, matched_issnls: set):
        """
        Get all ISSNs associated with a set of ISSN-Ls.

        :param matched_issnls: ISSN-Ls matched for a given cited reference
        :return: set of ISSNs linked to the ISSN-Ls
        """
        possible_issns = set()
        for mi in matched_issnls:
            possible_issns = possible_issns.union(
                set([
                    j for j in self.db['issnl-to-data'].get(mi, {}).get(
                        'issns', [])
                ]))
        return possible_issns

    def get_status(self, match_mode: str, mount_mode: int, db_used: str):
        """
        Get the status code based on the match mode, the volume mode used
        and the validation base used.

        :param match_mode: match mode ['exact', 'fuzzy']
        :param mount_mode: how the validation key was obtained
            [VOLUME_IS_ORIGINAL, VOLUME_IS_INFERRED]
        :param db_used: validation base used ['lr', 'lr-ml1', 'default']
        :return: status code for the method used
        """
        if mount_mode == VOLUME_IS_ORIGINAL:
            if match_mode == 'exact':
                if db_used == 'lr':
                    return STATUS_EXACT_VALIDATED_LR
                elif db_used == 'lr-ml1':
                    return STATUS_EXACT_VALIDATED_LR_ML1
                elif db_used == 'default':
                    return STATUS_EXACT_VALIDATED
            else:
                if db_used == 'lr':
                    return STATUS_FUZZY_VALIDATED_LR
                elif db_used == 'lr-ml1':
                    return STATUS_FUZZY_VALIDATED_LR_ML1
                elif db_used == 'default':
                    return STATUS_FUZZY_VALIDATED
        elif mount_mode == VOLUME_IS_INFERRED:
            if match_mode == 'exact':
                if db_used == 'lr':
                    return STATUS_EXACT_VOLUME_INFERRED_VALIDATED_LR
                elif db_used == 'lr-ml1':
                    return STATUS_EXACT_VOLUME_INFERRED_VALIDATED_LR_ML1
                elif db_used == 'default':
                    return STATUS_EXACT_VOLUME_INFERRED_VALIDATED
            else:
                if db_used == 'lr':
                    return STATUS_FUZZY_VOLUME_INFERRED_VALIDATED_LR
                elif db_used == 'lr-ml1':
                    return STATUS_FUZZY_VOLUME_INFERRED_VALIDATED_LR_ML1
                elif db_used == 'default':
                    return STATUS_FUZZY_VOLUME_INFERRED_VALIDATED

    def infer_volume(self, issn: str, year: str):
        """
        Infer a journal's volume from issn-to-equation (linear fit).

        :param issn: ISSN for which the volume will be inferred
        :param year: publication year fed into the linear equation
        :return: inferred volume rounded to an integer, as str
            (only if the inferred volume is greater than 0)
        """
        equation = self.db['issn-to-equation'].get(issn)
        if equation:
            a, b, r2 = equation
            volume = a + (b * int(year))
            if volume > 0:
                return str(round(volume))

    def match_exact(self, journal_title: str):
        """
        Look up journal_title exactly in the title-to-issnl dict.

        :param journal_title: title of the cited journal
        :return: set of ISSN-Ls exactly associated with the cited journal title
        """
        return self.db['title-to-issnl'].get(journal_title, set())

    def match_fuzzy(self, journal_title: str):
        """
        Look up journal_title approximately in the title-to-issnl dict.

        :param journal_title: title of the cited journal
        :return: set of ISSN-Ls approximately associated with the cited
            journal title
        """
        matches = set()

        words = journal_title.split(' ')

        # For the comparison to be possible, the title must have at least
        # MIN_CHARS_LENGTH letters and be composed of at least
        # MIN_WORDS_COUNT words.
        if len(journal_title) > MIN_CHARS_LENGTH and len(
                words) >= MIN_WORDS_COUNT:

            # NOTE(review): inside [\w|\s] the '|' is a literal character,
            # not alternation -- presumably [\w\s] was intended; behavior
            # differs only for titles containing '|'.
            pattern = r'[\w|\s]*'.join([word for word in words]) + '[\w|\s]*'
            title_pattern = re.compile(pattern, re.UNICODE)

            # The official title must start with the first word of the
            # searched title.
            for official_title in [
                    ot for ot in self.db['title-to-issnl'].keys()
                    if ot.startswith(words[0])
            ]:
                if title_pattern.fullmatch(official_title):
                    matches = matches.union(
                        self.db['title-to-issnl'][official_title])
        return matches

    def mount_id(self, cit: Citation, collection: str):
        """
        Build the identifier of a cited reference.

        :param cit: cited reference
        :param collection: collection in which the reference was cited
        :return: citation identifier code
        """
        cit_id = cit.data['v880'][0]['_']
        return '{0}-{1}'.format(cit_id, collection)

    def mount_standardized_citation_data(self,
                                         status: int,
                                         key=None,
                                         issn_l=None):
        """
        Look up issn_l (from key or from issn_l) in the issnl-to-data dict
        to build the normalized structure of the cited reference, with the
        following fields:

            issn-l: ISSN-Link of the cited journal (str)
            issn: ISSNs associated with the ISSN-L (list of str)
            official-journal-title: official title of the cited journal (str)
            official-abbreviated-journal-title: official abbreviated title
            alternative-journal-titles: alternative titles (list of str)
            status: code indicating the normalization method
            update-date: normalization date

        :param status: code indicating the method applied to normalize
        :param key: key from which the ISSN-L is extracted and looked up
        :param issn_l: ISSN-L to be looked up in the correction base
        :return: dict of key-value pairs of normalized data
        """
        if not issn_l:
            issn_l = self.extract_issnl_from_valid_match(key)

        attrs = self.db['issnl-to-data'][issn_l]

        data = {
            'issn-l': self.add_hifen_issn(issn_l),
            'issn': [self.add_hifen_issn(i) for i in attrs['issns']],
            'official-journal-title': attrs['main-title'],
            'official-abbreviated-journal-title': attrs['main-abbrev-title'],
            'alternative-journal-titles': attrs['alternative-titles'],
            'status': status,
            'update-date': datetime.now().strftime('%Y-%m-%d')
        }

        return data

    def save_standardized_citations(self, std_citations: dict):
        """
        Persist the normalized cited references.

        :param std_citations: dict of normalized cited references
        """
        if self.persist_mode == 'json':
            # JSON-lines: one batch appended per line.
            with open(self.path_results, 'a') as f:
                json.dump(std_citations, f)
                f.write('\n')
        elif self.persist_mode == 'mongo':
            for v in std_citations.values():
                self.standardizer.update_one(filter={'_id': v['_id']},
                                             update={'$set': v},
                                             upsert=True)

    def get_citation_mongo_status(self, cit_id: str):
        """
        Get the current normalization status of a cited reference.

        :param cit_id: id of the cited reference
        :return: current normalization status of the cited reference
        """
        if self.persist_mode == 'mongo':
            cit_standardized = self.standardizer.find_one({'_id': cit_id})
            if cit_standardized:
                return cit_standardized.get('status', STATUS_NOT_NORMALIZED)
        # json mode (or document absent) always reprocesses.
        return STATUS_NOT_NORMALIZED

    def validate_match(self, keys, use_lr=False, use_lr_ml1=False):
        """
        Validate ISSN-YEAR-VOLUME keys against the validation bases.

        :param keys: keys in ISSN-YEAR-VOLUME format
        :param use_lr: validate against linear-regression data
        :param use_lr_ml1: validate against linear-regression data
            plus/minus 1
        :return: validated keys
        """
        valid_matches = set()

        if use_lr:
            validating_base = self.db['issn-year-volume-lr']
        elif use_lr_ml1:
            validating_base = self.db['issn-year-volume-lr-ml1']
        else:
            validating_base = self.db['issn-year-volume']

        for k in keys:
            if k in validating_base:
                valid_matches.add(k)

        return valid_matches

    def _standardize(self, cit, cleaned_cit_journal_title, mode='exact'):
        """
        Helper that matches a cited journal title and, when there is more
        than one match, disambiguates the candidate ISSN-Ls using the
        citation's year and volume data.

        :param cit: cited reference
        :param cleaned_cit_journal_title: cleaned title of the cited journal
        :param mode: matching mode ['exact', 'fuzzy']
        :return: dict of normalized data (None when no unambiguous match)
        """
        if mode == 'fuzzy':
            matches = self.match_fuzzy(cleaned_cit_journal_title)
        else:
            matches = self.match_exact(cleaned_cit_journal_title)

        # Single exact match: accepted without further validation.
        if len(matches) == 1 and mode == 'exact':
            return self.mount_standardized_citation_data(
                status=STATUS_EXACT, issn_l=matches.pop())
        # Multiple matches, or a fuzzy match -- needs year/volume validation.
        # NOTE(review): the parenthesization '(mode == ... and len(matches))
        # == 1' is odd but happens to be equivalent to the intended
        # '(mode == "fuzzy" and len(matches) == 1)' because False == 1 is
        # False.
        elif len(matches) > 1 or (mode == 'fuzzy' and len(matches)) == 1:
            # Load every possible ISSN associated with the matched ISSN-Ls.
            possible_issns = self.get_issns(matches)

            if possible_issns:
                # Build ISSN-YEAR-VOLUME keys.
                keys, mount_mode = self.extract_issn_year_volume_keys(
                    cit, possible_issns)

                if keys:
                    # Validate keys against the year-volume base, then fall
                    # back to the linear-regression bases (strict, then ±1).
                    cit_valid_matches = self.validate_match(keys)
                    if len(cit_valid_matches) == 1:
                        status = self.get_status(mode, mount_mode, 'default')
                        return self.mount_standardized_citation_data(
                            status, cit_valid_matches.pop())
                    elif len(cit_valid_matches) == 0:
                        cit_valid_matches = self.validate_match(keys,
                                                                use_lr=True)
                        if len(cit_valid_matches) == 1:
                            status = self.get_status(mode, mount_mode, 'lr')
                            return self.mount_standardized_citation_data(
                                status, cit_valid_matches.pop())
                        elif len(cit_valid_matches) == 0:
                            cit_valid_matches = self.validate_match(
                                keys, use_lr_ml1=True)
                            if len(cit_valid_matches) == 1:
                                status = self.get_status(
                                    mode, mount_mode, 'lr-ml1')
                                return self.mount_standardized_citation_data(
                                    status, cit_valid_matches.pop())

    def standardize(self, document):
        """
        Normalize the cited references of an article, in two ways: exact
        and fuzzy.  Persists results to a JSON file or to MongoDB.

        :param document: Article whose cited references will be normalized
        """
        std_citations = {}

        if document.citations:
            for c, cit in enumerate([
                    dc for dc in document.citations
                    if dc.publication_type == 'article'
            ]):
                cit_id = self.mount_id(cit, document.collection_acronym)
                cit_current_status = self.get_citation_mongo_status(cit_id)

                # Only citations not yet normalized are (re)processed.
                if cit_current_status == STATUS_NOT_NORMALIZED:
                    cleaned_cit_journal_title = preprocess_journal_title(
                        cit.source)
                    if cleaned_cit_journal_title:
                        if self.use_exact:
                            exact_match_result = self._standardize(
                                cit, cleaned_cit_journal_title)
                            if exact_match_result:
                                exact_match_result.update({
                                    '_id': cit_id,
                                    'cited-journal-title':
                                    cleaned_cit_journal_title
                                })
                                std_citations[cit_id] = exact_match_result
                                cit_current_status = exact_match_result[
                                    'status']

                        # Fuzzy matching only runs when exact matching did
                        # not already resolve the citation.
                        if self.use_fuzzy:
                            if cit_current_status == STATUS_NOT_NORMALIZED:
                                fuzzy_match_result = self._standardize(
                                    cit,
                                    cleaned_cit_journal_title,
                                    mode='fuzzy')
                                if fuzzy_match_result:
                                    fuzzy_match_result.update({
                                        '_id': cit_id,
                                        'cited-journal-title':
                                        cleaned_cit_journal_title
                                    })
                                    std_citations[cit_id] = fuzzy_match_result
                                    cit_current_status = fuzzy_match_result[
                                        'status']

                        # Record an explicit "not normalized" result so the
                        # attempt itself is persisted.
                        if cit_current_status == STATUS_NOT_NORMALIZED and (
                                self.use_exact or self.use_fuzzy):
                            unmatch_result = {
                                '_id': cit_id,
                                'cited-journal-title':
                                cleaned_cit_journal_title,
                                'status': STATUS_NOT_NORMALIZED,
                                'update-date':
                                datetime.now().strftime('%Y-%m-%d')
                            }
                            std_citations[cit_id] = unmatch_result

        if std_citations:
            self.save_standardized_citations(std_citations)
# Create new threads threads = [] threadID = 0 for i in range(NR_OF_THREADS): contractCollection = MongoClient(MONGO_HOST, MONGO_PORT)[DATABASE][COLLECTION] thread = searchThread(threadID, contractQueue, contractCollection) thread.start() threads.append(thread) threadID += 1 contractCollection = MongoClient(MONGO_HOST, MONGO_PORT)[DATABASE][COLLECTION] cursor = contractCollection.find() print("Total number of smart contracts: " + str(contractCollection.count_documents({}))) uniques = set() contracts = [] distinct_bytecode = {} distinct_deployer = {} for contract in cursor: if not contract["creator"] in distinct_deployer: distinct_deployer[contract["creator"]] = 1 else: distinct_deployer[contract["creator"]] += 1 if not contract["byteCode"].encode("utf-8") in uniques: uniques.add(contract["byteCode"].encode("utf-8")) contracts.append(contract) distinct_bytecode[contract["byteCode"].encode("utf-8")] = 1 else:
authSource='admin' ).local.oplog.rs # convert current_date_int to timestamp ts = Timestamp(datetime.utcnow() - timedelta(minutes=10), 1) # query records for only delete ops with in last 30 mins query = { '$and': [ { 'ts': { '$gte': ts } }, { 'op': { '$in': ['u'] } }, { 'ns': { '$regex': 'Cluster0.', '$options': 'i' } } ] } if oplog.count_documents(query): cursor = oplog.find(query) for doc in cursor: pprint(doc) db, collection = doc['ns'].split('.') # connect to local server local_client = MongoClient( '127.0.0.1:27017', username='******', password='******', authSource='admin' ) local_db = local_client[db][collection] updateQuery = { '_id': doc['o2']['_id'] }
text = data["text"] if user_id not in reviewsByUser: reviewsByUser[user_id] = [] reviewsByUser[user_id].append(text) for i, val in reviewsByUser.items(): USER_D.insert_one({"USER_ID": i, "TEXT": val}) reviewsByBusiness = {} with open(dataset_file) as dataset: next(dataset) for line in dataset: try: data = json.loads(line) except ValueError: print('Oops!') business_id = data["business_id"] text = data["text"] if business_id not in reviewsByBusiness: reviewsByBusiness[business_id] = [] reviewsByBusiness[business_id].append(text) for i, val in reviewsByBusiness.items(): BUSINESS_D.insert_one({"BUSINESS_ID": i, "TEXT": val}) print("The Number of Unique users in the databse is", USER_D.count_documents({})) print("The Number of Unique Businesses in the databse is", BUSINESS_D.count_documents({}))
class MongoToElasticsearch():
    """Incrementally mirror a MongoDB collection into an Elasticsearch index.

    Documents carry a state field (ES_STATE) that drives the sync: pending
    documents are bulk-indexed (or deleted) in Elasticsearch and then marked
    'complete' or 'error' back in MongoDB.
    """

    def __init__(self, index=ES_INDEX, collection=MONGO_COL):
        self._es = Elasticsearch(ES_HOSTS)
        self._index = index
        self._col = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][collection]
        self._setup_index()

    def _setup_index(self):
        # Create the index (with a geo_point mapping on meta.location)
        # unless it already exists.
        if not self._es.indices.exists(self._index):
            self._es.indices.create(
                index=self._index,
                body={
                    'settings': {
                        'index': {
                            'refresh_interval': '1m'
                        }
                    },
                    'mappings': {
                        ES_TYPE: {
                            'properties': {
                                'meta': {
                                    'properties': {
                                        'location': {
                                            'type': 'geo_point'
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            )

    def _transform(self, obj):
        """Turn a Mongo document into an ES bulk action (index or delete)."""
        action = {
            '_index': self._index,
            '_id': str(obj['_id']),
            '_type': ES_TYPE
        }
        del obj['_id']
        # NOTE(review): run() queries for state 'remove' but only 'delete'
        # is special-cased here -- confirm which state name is canonical.
        if obj[ES_STATE] == 'delete':
            action['_op_type'] = 'delete'
        # Strip the sync bookkeeping field before indexing.
        del obj[ES_STATE]
        action['_source'] = obj
        return action

    def _insert_batch(self, batch):
        """Send one bulk batch to ES and record per-document status in Mongo."""
        if not batch:
            # PyMongo's bulk_write raises InvalidOperation on an empty
            # request list (and there is nothing to send to ES either).
            return
        mongo_batch = []
        for ok, result in helpers.parallel_bulk(self._es, batch):
            action, result = result.popitem()
            oid = ObjectId(result['_id'])
            state = 'complete' if ok else 'error'
            mongo_batch.append(
                UpdateOne({'_id': oid}, {'$set': {ES_STATE: state}}))
            if not ok:
                # The original passed logging-style %s args to print();
                # format the message explicitly instead.
                print('Failed to %s: %s' % (action, result['_id']))
        if mongo_batch:
            self._col.bulk_write(mongo_batch)

    def run(self):
        """Stream all pending documents through ES in BATCH_SIZE chunks."""
        batch = []
        query = {
            '$or': [
                {ES_STATE: 'insert'},
                {ES_STATE: 'update'},
                {ES_STATE: 'remove'}
            ]
        }
        with tqdm(total=self._col.count_documents(query)) as pbar:
            for obj in self._col.find(query):
                batch.append(self._transform(obj))
                if len(batch) == BATCH_SIZE:
                    self._insert_batch(batch)
                    batch = []
                    pbar.update(BATCH_SIZE)
            # Flush the remainder (may be empty; _insert_batch handles that).
            self._insert_batch(batch)
            pbar.update(len(batch))