"""Retags abstracts: 'matthew' -> 'dataset2', 'gabby' -> 'dataset1'."""
from pymongo import MongoClient, UpdateMany

from config import DATABASE_URL  # assumed: connection string kept in a local config module


def main():
    collection = MongoClient(DATABASE_URL).abstracts.all
    print('Making requests...')
    requests = []
    # Each rename runs in two phases because MongoDB rejects a single update
    # that applies $addToSet and $pull to the same field.
    requests.append(UpdateMany(
        {'tags': 'matthew'},
        {'$addToSet': {'tags': 'dataset2'}}
    ))
    requests.append(UpdateMany(
        {'tags': {'$all': ['matthew', 'dataset2']}},
        {'$pull': {'tags': 'matthew'}}
    ))
    requests.append(UpdateMany(
        {'tags': 'gabby'},
        {'$addToSet': {'tags': 'dataset1'}}
    ))
    requests.append(UpdateMany(
        {'tags': {'$all': ['gabby', 'dataset1']}},
        {'$pull': {'tags': 'gabby'}}
    ))
    print('Updating database...')
    response = collection.bulk_write(requests)
    print(f'Modified: {response.modified_count}')
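# Optional dry run (hypothetical helper, not part of the original script):
# counts how many documents each rename would touch before any writes happen.
def preview_retag_counts():
    collection = MongoClient(DATABASE_URL).abstracts.all
    for old, new in (('matthew', 'dataset2'), ('gabby', 'dataset1')):
        n = collection.count_documents({'tags': old})
        print(f"{old} -> {new}: {n} document(s) still tagged '{old}'")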
from datetime import datetime
from os import environ

from pymongo import MongoClient, UpdateOne


def main() -> int:
    """Expand station counts into date documents"""
    coll = MongoClient(environ["MONGO_URI"]).counter.station
    # Work through stations that haven't been expanded yet, one at a time
    while item := coll.find_one({"date": {"$exists": False}}):
        icao = item.pop("_id")
        print(icao)
        # Pivot {report: {date: count}} into {date: {report: count}}
        counts = {}
        for report, dates in item.items():
            for date_str, count in dates.items():
                date = datetime.strptime(date_str, r"%Y-%m-%d")
                try:
                    counts[date][report] = count
                except KeyError:
                    counts[date] = {report: count}
        updates = [make_update(icao, date, count) for date, count in counts.items()]
        print(icao, len(updates))
        if updates:
            coll.bulk_write(updates, ordered=False)
        print("Deleting")
        coll.delete_one({"_id": icao})
    return 0
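# make_update is not defined in this fragment. A minimal sketch consistent
# with how it is called above (icao, a datetime, and a report -> count dict)
# would upsert one document per station per day:
def make_update(icao: str, date: datetime, counts: dict) -> UpdateOne:
    return UpdateOne(
        {"icao": icao, "date": date},  # one document per station per day
        {"$set": {f"count.{report}": n for report, n in counts.items()}},
        upsert=True,
    )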
import datetime

import spacy
from pymongo import MongoClient, UpdateOne

# Assumed imports: MaterialsTextProcessor ships with mat2vec, and DATABASE_URL
# is kept in a local config module.
from mat2vec.processing import MaterialsTextProcessor
from config import DATABASE_URL


class Scraper:
    nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
    processor = MaterialsTextProcessor()

    def __init__(self, classifiers, database='abstracts', collection='all',
                 save_all=False, gen_tag='food science'):
        """
        Initializes Scraper class

        :param classifiers: list of models used to determine relevance of abstracts
        :param database: defaults to 'abstracts', database to store abstracts in
        :param collection: defaults to 'all', collection to store abstracts in
        :param save_all: defaults to False, Bool flag to save all articles from query
        :param gen_tag: defaults to 'food science', name of tag to apply to all
            articles (required only if save_all is True)
        """
        self._classifiers = classifiers
        self._collection = MongoClient(DATABASE_URL)[database][collection]
        self._save = save_all
        self._gen_tag = gen_tag
        self._gen_new = 0
        self._gen_total = 0

        # create collection indices
        self._collection.create_index('doi', name='doi', unique=True, sparse=True)
        self._collection.create_index('uid', name='uid', unique=True, sparse=True)
        self._collection.create_index('pmc', name='pmc', unique=True, sparse=True)
        self._collection.create_index('tags', name='tags')
        self._collection.create_index('database', name='database')

    def _get_date(self, date):
        """
        Converts date into datetime object

        :param date: date formatted 'YYYY-MM-DD'
        """
        if not date:
            return None
        year, month, day = date.split('-')
        return datetime.datetime(int(year), int(month), int(day))

    def _save_all(self, articles):
        """
        Stores all articles from database query (regardless of classifier
        result) under the general tag

        :param articles: list of article objects to add to database
        """
        self._gen_total += len(articles)

        # creates requests to store articles with the general tag
        requests = []
        for article in articles:
            # creates document to insert by filtering out fields that are None
            doc = {k: v for k, v in article.items() if v is not None}
            doi = doc.get('doi')
            uid = doc.get('uid')
            pmc = doc.get('pmc')

            # keeps either doi, uid, or pmc as the only id, in that preference order
            if doi:
                filter = {'doi': doi}
                doc.pop('uid', None)
                doc.pop('pmc', None)
            elif uid:
                filter = {'uid': uid}
                doc.pop('doi', None)
                doc.pop('pmc', None)
            else:
                filter = {'pmc': pmc}
                doc.pop('doi', None)
                doc.pop('uid', None)

            # inserts a new document if none exists and adds the general tag
            requests.append(UpdateOne(filter, {
                '$setOnInsert': doc,
                '$addToSet': {'tags': self._gen_tag}
            }, upsert=True))

        # updates database
        if requests:
            mongo = self._collection.bulk_write(requests, ordered=False)
            self._gen_new += (mongo.upserted_count + mongo.modified_count) if mongo else 0

    def _store(self, articles, abstracts):
        """
        Classifies articles based on processed abstracts and stores them in
        the database if relevant

        :param articles: list of article objects to add to database
        :param abstracts: list of processed abstracts to be checked against
            the classifiers
        """
        for classifier in self._classifiers:
            classifier.total += len(articles)

            # uses classifier to determine if relevant
            predictions = classifier.predict(abstracts)

            # creates requests to store articles with the classifier's tag
            requests = []
            for i, article in enumerate(articles):
                if predictions[i]:
                    # creates document to insert by filtering out fields that are None
                    doc = {k: v for k, v in article.items() if v is not None}
                    doi = doc.get('doi')
                    uid = doc.get('uid')
                    pmc = doc.get('pmc')
                    paperid = doc.get('paperid')  # unique s2orc paper id

                    # keeps either doi, uid, pmc, or paperid as the only id,
                    # in that preference order
                    if doi:
                        filter = {'doi': doi}
                        doc.pop('uid', None)
                        doc.pop('pmc', None)
                        doc.pop('paperid', None)
                    elif uid:
                        filter = {'uid': uid}
                        doc.pop('doi', None)
                        doc.pop('pmc', None)
                        doc.pop('paperid', None)
                    elif pmc:
                        filter = {'pmc': pmc}
                        doc.pop('doi', None)
                        doc.pop('uid', None)
                        doc.pop('paperid', None)
                    else:
                        filter = {'paperid': paperid}
                        doc.pop('doi', None)
                        doc.pop('uid', None)
                        doc.pop('pmc', None)

                    # inserts a new document if none exists and adds the
                    # classifier's tag
                    requests.append(UpdateOne(filter, {
                        '$setOnInsert': doc,
                        '$addToSet': {'tags': classifier.tag}
                    }, upsert=True))
                # ignores irrelevant articles, but keeps track of their number
                else:
                    classifier.irrelevant += 1

            # updates database
            if requests:
                mongo = self._collection.bulk_write(requests, ordered=False)
                classifier.relevant += (mongo.upserted_count + mongo.modified_count) if mongo else 0

        # if flag is marked True, stores all articles from the query
        if self._save:
            self._save_all(articles)
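# Usage sketch. The Scraper only assumes each classifier exposes .predict(),
# .tag, and the .total/.relevant/.irrelevant counters used above; this keyword
# stub is hypothetical (the real classifiers are trained models).
class KeywordClassifier:
    def __init__(self, tag, keyword):
        self.tag = tag
        self.keyword = keyword
        self.total = self.relevant = self.irrelevant = 0

    def predict(self, abstracts):
        return [self.keyword in abstract for abstract in abstracts]


# scraper = Scraper([KeywordClassifier('food science', 'flavor')], save_all=True)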
# `members` (a queue.Queue of members to geocode), `no_location_members`, and
# the Mongo collection `c` are defined earlier in this script.
data_store = []
g = GmapsGeocoder(members, data_store)
threads = [Thread(target=g.get_member_locations) for _ in range(10)]
for i in no_location_members:
    members.put(i)
for t in threads:
    t.start()
try:
    members.join()
except Exception:
    print("In exception")
finally:
    # flush the UpdateOne requests the workers buffered
    c.bulk_write(data_store)

# pp = pprint.PrettyPrinter(indent=4)
# col = MongoClient()["tubules"]["members"]
# # Find all members who don't have location points in already
# for member in col.find({"location": {"$exists": False}})[1:]:
#     print(member)
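# Hypothetical sketch of the worker each thread runs above; GmapsGeocoder's
# internals are not shown in this file, and geocode_address below stands in
# for whatever Google Maps call it makes. The essential contract: drain the
# shared queue, buffer one UpdateOne per member, and call task_done() so that
# members.join() can return.
def get_member_locations_sketch(members, data_store):
    while True:
        member = members.get()
        try:
            point = geocode_address(member["address"])  # hypothetical geocoding helper
            data_store.append(UpdateOne(
                {"_id": member["_id"]},
                {"$set": {"location": point}},
            ))
        finally:
            members.task_done()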
filter = {"ip": {"$eq": host}} update = {"$push": {"timeline": {"time": timestamp, "up": True}}} writes.append(UpdateOne(filter, update, upsert=True)) for host in down_hosts: filter = {"ip": {"$eq": host}} update = {"$push": {"timeline": {"time": timestamp, "up": False}}} writes.append(UpdateOne(filter, update, upsert=True)) # if test: # print(writes) # else: # ips.bulk_write(writes) if len(writes) > 0: ips.bulk_write(writes) print(writes) # def callback(scan): # report = find_hosts([host.ipv4 for host in up_hosts], '-Pn -A -n')#, callback=callback) # report = find_hosts(["18.93.6.23", "18.33.0.158"], '-Pn -O -Sv -Sc -n')#, callback=callback) # for host in report.hosts: # pp.pprint(host.ipv4) # pp.pprint(host.os) # pp.pprint(host.services) # pp.pprint(host.scripts_results) # # pp.pprint(host.get_dict()) # # pp.pprint(host.os_class_probabilities()) # matches = host.os_match_probabilities() # for match in matches:
from bson import ObjectId
from elasticsearch import Elasticsearch, helpers
from pymongo import MongoClient, UpdateOne
from tqdm import tqdm

# Assumed: these constants live in a local config module.
from config import (BATCH_SIZE, ES_HOSTS, ES_INDEX, ES_STATE, ES_TYPE,
                    MONGO_COL, MONGO_DB, MONGO_HOST, MONGO_PORT)


class MongoToElasticsearch:

    def __init__(self, index=ES_INDEX, collection=MONGO_COL):
        self._es = Elasticsearch(ES_HOSTS)
        self._index = index
        self._col = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][collection]
        self._setup_index()

    def _setup_index(self):
        if not self._es.indices.exists(self._index):
            self._es.indices.create(
                index=self._index,
                body={
                    'settings': {
                        'index': {'refresh_interval': '1m'}
                    },
                    'mappings': {
                        ES_TYPE: {
                            'properties': {
                                'meta': {
                                    'properties': {
                                        'location': {'type': 'geo_point'}
                                    }
                                }
                            }
                        }
                    }
                }
            )

    def _transform(self, obj):
        action = {
            '_index': self._index,
            '_id': str(obj['_id']),
            '_type': ES_TYPE
        }
        del obj['_id']
        if obj[ES_STATE] == 'delete':
            action['_op_type'] = 'delete'
        del obj[ES_STATE]
        action['_source'] = obj
        return action

    def _insert_batch(self, batch):
        if not batch:  # bulk_write raises on an empty request list
            return
        mongo_batch = []
        # raise_on_error=False so failed actions reach the error branch below
        for ok, result in helpers.parallel_bulk(self._es, batch,
                                                raise_on_error=False):
            action, result = result.popitem()
            oid = ObjectId(result['_id'])
            if ok:
                mongo_batch.append(UpdateOne(
                    {'_id': oid},
                    {'$set': {ES_STATE: 'complete'}}
                ))
            else:
                mongo_batch.append(UpdateOne(
                    {'_id': oid},
                    {'$set': {ES_STATE: 'error'}}
                ))
                print('Failed to %s: %s' % (action, result['_id']))
        self._col.bulk_write(mongo_batch)

    def run(self):
        batch = []
        # the states here must match what _transform checks: it keys the
        # delete op off 'delete'
        query = {
            '$or': [
                {ES_STATE: 'insert'},
                {ES_STATE: 'update'},
                {ES_STATE: 'delete'}
            ]
        }
        with tqdm(total=self._col.count_documents(query)) as pbar:
            for obj in self._col.find(query):
                batch.append(self._transform(obj))
                if len(batch) == BATCH_SIZE:
                    self._insert_batch(batch)
                    batch = []
                    pbar.update(BATCH_SIZE)
            # Flush remaining
            pbar.update(len(batch))
            self._insert_batch(batch)
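# Typical invocation: sync everything pending, as configured above.
if __name__ == '__main__':
    MongoToElasticsearch().run()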