Example no. 1

from pymongo import MongoClient, UpdateMany

# DATABASE_URL is assumed to be defined elsewhere in the original module
# (e.g. loaded from configuration or the environment).

def main():
    collection = MongoClient(DATABASE_URL).abstracts.all

    print('Making requests...')
    requests = []
    requests.append(UpdateMany(
        { 'tags': 'matthew' },
        { '$addToSet': { 'tags': 'dataset2' } }
    ))
    requests.append(UpdateMany(
        { 'tags': { '$all': ['matthew', 'dataset2'] } },
        { '$pull': { 'tags': 'matthew' } }
    ))
    requests.append(UpdateMany(
        { 'tags': 'gabby' },
        { '$addToSet': { 'tags': 'dataset1' } }
    ))
    requests.append(UpdateMany(
        { 'tags': { '$all': ['gabby', 'dataset1'] } },
        { '$pull': { 'tags': 'gabby' } }
    ))

    print('Updating database...')
    response = collection.bulk_write(requests)
    print(f'Modified: {response.modified_count}')
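
The four requests above repeat one rename pattern: first $addToSet the new tag on every document carrying the old tag, then $pull the old tag from documents that now carry both. A minimal helper that generates the same pair of UpdateMany requests for an arbitrary rename (the helper name is illustrative, not from the original script):

def rename_tag_requests(old_tag, new_tag):
    # Add the new tag wherever the old tag appears, then remove the old tag.
    return [
        UpdateMany({'tags': old_tag},
                   {'$addToSet': {'tags': new_tag}}),
        UpdateMany({'tags': {'$all': [old_tag, new_tag]}},
                   {'$pull': {'tags': old_tag}}),
    ]

With that helper, the request list above is rename_tag_requests('matthew', 'dataset2') + rename_tag_requests('gabby', 'dataset1').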
Example no. 2

from datetime import datetime
from os import environ

from pymongo import MongoClient

def main() -> int:
    """Expand station counts into date documents"""
    coll = MongoClient(environ["MONGO_URI"]).counter.station
    while item := coll.find_one({"date": {"$exists": False}}):
        icao = item.pop("_id")
        print(icao)
        # Pivot {report: {date: count}} into {date: {report: count}}
        counts = {}
        for report, dates in item.items():
            for date_str, count in dates.items():
                date = datetime.strptime(date_str, r"%Y-%m-%d")
                counts.setdefault(date, {})[report] = count
        updates = [make_update(icao, date, day_counts) for date, day_counts in counts.items()]
        print(icao, len(updates))
        if updates:
            coll.bulk_write(updates, ordered=False)
        print("Deleting")
        coll.delete_one({"_id": icao})
    return 0
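
make_update is not shown in this excerpt. A minimal sketch of what it could look like, assuming each station/date pair is upserted as its own document carrying the per-report counts (only the call signature comes from the code above; the document shape is an assumption):

from pymongo import UpdateOne

def make_update(icao, date, day_counts):
    # Hypothetical shape: one document per station and date, one field per report type.
    return UpdateOne(
        {"icao": icao, "date": date},
        {"$set": day_counts},
        upsert=True,
    )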
Example no. 3

import datetime

import spacy
from pymongo import MongoClient, UpdateOne

# MaterialsTextProcessor and DATABASE_URL are assumed to be imported/defined
# elsewhere in the original project (the mat2vec text processor and a
# configuration constant, respectively).

class Scraper:
    nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
    processor = MaterialsTextProcessor()

    def __init__(self,
                 classifiers,
                 database='abstracts',
                 collection='all',
                 save_all=False,
                 gen_tag='food science'):
        """
        Initializes Scraper class

        :param classifiers: list of models used to determine abstract relevance
        :param database: defaults to 'abstracts', database to store abstracts in
        :param collection: defaults to 'all', collection to store abstracts in
        :param save_all: defaults to False, Bool flag to save all articles from query
        :param gen_tag: defaults to 'food science', name of tag to apply to all articles (required only if save_all is True)
        """
        self._classifiers = classifiers
        self._collection = MongoClient(DATABASE_URL)[database][collection]
        self._save = save_all
        self._gen_tag = gen_tag
        self._gen_new = 0
        self._gen_total = 0

        # create collection indices
        self._collection.create_index('doi',
                                      name='doi',
                                      unique=True,
                                      sparse=True)
        self._collection.create_index('uid',
                                      name='uid',
                                      unique=True,
                                      sparse=True)
        self._collection.create_index('pmc',
                                      name='pmc',
                                      unique=True,
                                      sparse=True)
        self._collection.create_index('tags', name='tags')
        self._collection.create_index('database', name='database')

    def _get_date(self, date):
        """
        Converts date into datetime object

        :param date: date formatted 'YYYY-MM-DD'
        """
        if not date:
            return None
        date_array = date.split('-')
        return datetime.datetime(int(date_array[0]), int(date_array[1]),
                                 int(date_array[2]))

    def _save_all(self, articles):
        """
        Stores all articles from database query (regardless of classifier result) under general tag

        :param articles: list of article objects to add to database
        """
        self._gen_total += len(articles)

        # creates request to store article with corresponding tag
        requests = []
        for article in articles:
            # creates document to insert by filtering out fields that are None
            doc = {k: v for k, v in article.items() if v is not None}
            doi = doc.get('doi')
            uid = doc.get('uid')
            pmc = doc.get('pmc')

            # sets either doi, uid, or pmc as the only id in that
            # preference order
            if doi:
                filter = {'doi': doi}
                doc.pop('uid', None)
                doc.pop('pmc', None)

            elif uid:
                filter = {'uid': uid}
                doc.pop('doi', None)
                doc.pop('pmc', None)
            else:
                filter = {'pmc': pmc}
                doc.pop('doi', None)
                doc.pop('uid', None)

            # inserts a new document if it does not exist and adds the general
            # tag (no relevance check here; that happens in _store)
            requests.append(
                UpdateOne(filter, {
                    '$setOnInsert': doc,
                    '$addToSet': {
                        'tags': self._gen_tag
                    }
                },
                          upsert=True))

        # updates database
        if requests:
            mongo = self._collection.bulk_write(requests, ordered=False)
            self._gen_new += mongo.upserted_count + mongo.modified_count

    def _store(self, articles, abstracts):
        """
        Classifies articles based on processed abstracts and stores in database
        if relevant

        :param articles: list of article objects to add to database
        :param abstracts: list of processed abstracts to be checked against classifier
        """
        for classifier in self._classifiers:
            classifier.total += len(articles)

            # uses classifier to determine if relevant
            predictions = classifier.predict(abstracts)

            # creates request to store article with corresponding tag
            requests = []
            for i, article in enumerate(articles):
                if predictions[i]:
                    # creates document to insert by filtering out fields that are None
                    doc = {k: v for k, v in article.items() if v is not None}
                    doi = doc.get('doi')
                    uid = doc.get('uid')
                    pmc = doc.get('pmc')
                    paperid = doc.get('paperid')  # unique s2orc paper id

                    # sets either doi, uid, or pmc as the only id in that
                    # preference order
                    if doi:
                        filter = {'doi': doi}
                        doc.pop('uid', None)
                        doc.pop('pmc', None)
                        doc.pop('paperid', None)
                    elif uid:
                        filter = {'uid': uid}
                        doc.pop('doi', None)
                        doc.pop('pmc', None)
                        doc.pop('paperid', None)
                    elif pmc:
                        filter = {'pmc': pmc}
                        doc.pop('doi', None)
                        doc.pop('uid', None)
                        doc.pop('paperid', None)
                    else:
                        filter = {'paperid': paperid}
                        doc.pop('doi', None)
                        doc.pop('uid', None)
                        doc.pop('pmc', None)

                    # if article is marked as relevant, inserts new document if it
                    # does not exist and adds to tag
                    requests.append(
                        UpdateOne(filter, {
                            '$setOnInsert': doc,
                            '$addToSet': {
                                'tags': classifier.tag
                            }
                        },
                                  upsert=True))

                # ignore irrelevant articles, but keep track of their number
                else:
                    classifier.irrelevant += 1

            # updates database
            if requests:
                mongo = self._collection.bulk_write(requests, ordered=False)
                classifier.relevant += mongo.upserted_count + mongo.modified_count

        # if flag is marked True, store all articles from query to database
        if self._save:
            self._save_all(articles)
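
A classifier passed to Scraper only needs a predict() method over processed abstracts plus tag, total, relevant and irrelevant attributes, which _store reads and updates. A minimal stub satisfying that interface (illustrative only, not part of the original project):

class KeywordClassifier:
    """Toy stand-in for the relevance models expected by Scraper."""

    def __init__(self, tag, keyword):
        self.tag = tag
        self.keyword = keyword
        self.total = 0
        self.relevant = 0
        self.irrelevant = 0

    def predict(self, abstracts):
        # One truthy/falsy prediction per processed abstract, in input order.
        return [self.keyword in abstract for abstract in abstracts]

A scraper could then be constructed as Scraper([KeywordClassifier('food science', 'food')]).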
Example no. 4

# Excerpt from a larger script: 'members' is a queue.Queue of member documents,
# 'no_location_members' holds the members without a stored location, and 'c' is
# a pymongo collection (all assumed from the calls below).
    data_store = []

    # Ten worker threads pull members off the queue and append geocoded
    # location updates to data_store.
    g = GmapsGeocoder(members, data_store)
    threads = [Thread(target=g.get_member_locations) for _ in range(10)]

    for member in no_location_members:
        members.put(member)

    for t in threads:
        t.start()
    try:
        members.join()
    except Exception:
        print("In exception")
    finally:
        # bulk_write raises InvalidOperation on an empty request list
        if data_store:
            c.bulk_write(data_store)

# pp = pprint.PrettyPrinter(indent=4)

# col = MongoClient()["tubules"]["members"]

# # Find all members who don't have location points in already
# for member in col.find({"location": {"$exists": False}})[1:]:
#     print(member)
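
get_member_locations is not defined in this excerpt. A hedged sketch of a queue-draining worker that would fit the fragment above, assuming members is a queue.Queue, data_store collects pymongo UpdateOne requests, and the actual Google Maps lookup is left as a placeholder:

from queue import Empty, Queue

from pymongo import UpdateOne


class GmapsGeocoder:
    def __init__(self, members: Queue, data_store: list):
        self._members = members
        self._data_store = data_store

    def _geocode(self, member):
        # Placeholder for the real Google Maps geocoding call.
        return None

    def get_member_locations(self):
        # Drain the queue; each successfully geocoded member becomes one update request.
        while True:
            try:
                member = self._members.get(timeout=1)
            except Empty:
                return
            try:
                location = self._geocode(member)
                if location:
                    self._data_store.append(
                        UpdateOne({"_id": member["_id"]},
                                  {"$set": {"location": location}}))
            finally:
                self._members.task_done()

With workers shaped like this, members.join() returns once every queued member has been processed, and the collected requests are flushed with a single bulk_write.
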
Example no. 5

from pymongo import UpdateOne

# Excerpt from a longer script: 'ips' is a pymongo collection, 'timestamp' is the
# scan time, and up_hosts/down_hosts come from an earlier nmap scan (assumed from
# the surrounding code).
writes = []

for host in up_hosts:
    filter = {"ip": {"$eq": host}}
    update = {"$push": {"timeline": {"time": timestamp, "up": True}}}
    writes.append(UpdateOne(filter, update, upsert=True))

for host in down_hosts:
    filter = {"ip": {"$eq": host}}
    update = {"$push": {"timeline": {"time": timestamp, "up": False}}}
    writes.append(UpdateOne(filter, update, upsert=True))

# if test:
# 	print(writes)
# else:
# ips.bulk_write(writes)

if writes:
    ips.bulk_write(writes)
print(writes)
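
# For reference, an upserted host document would look roughly like this after a
# few scans (values are illustrative):
# {
#     "_id": ObjectId("..."),
#     "ip": "203.0.113.7",
#     "timeline": [
#         {"time": <timestamp>, "up": True},
#         {"time": <timestamp>, "up": False},
#     ],
# }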

# def callback(scan):
# report = find_hosts([host.ipv4 for host in up_hosts], '-Pn -A -n')#, callback=callback)
# report = find_hosts(["18.93.6.23", "18.33.0.158"], '-Pn -O -Sv -Sc -n')#, callback=callback)

# for host in report.hosts:
# 	pp.pprint(host.ipv4)
# 	pp.pprint(host.os)
# 	pp.pprint(host.services)
# 	pp.pprint(host.scripts_results)
# 	# pp.pprint(host.get_dict())
# 	# pp.pprint(host.os_class_probabilities())
# 	matches = host.os_match_probabilities()
# 	for match in matches:
Example no. 6

from bson import ObjectId
from elasticsearch import Elasticsearch, helpers
from pymongo import MongoClient, UpdateOne
from tqdm import tqdm

# ES_HOSTS, ES_INDEX, ES_TYPE, ES_STATE, BATCH_SIZE and the MONGO_* constants are
# assumed to be defined in the original module's configuration.

class MongoToElasticsearch:
  def __init__(self, index=ES_INDEX, collection=MONGO_COL):
    self._es = Elasticsearch(ES_HOSTS)
    self._index = index
    self._col = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][collection]

    self._setup_index()

  def _setup_index(self):
    if not self._es.indices.exists(self._index):
      self._es.indices.create(
        index=self._index,
        body={
          'settings': {
            'index': {
              'refresh_interval': '1m'
            }
          },
          'mappings': {
            ES_TYPE: {
              'properties': {
                'meta': {
                  'properties': {
                    'location': {
                      'type': 'geo_point'
                    }
                  }
                }
              }
            }
          }
        }
      )

  def _transform(self, obj):
    action = {
      '_index': self._index,
      '_id': str(obj['_id']),
      '_type': ES_TYPE
    }
    del obj['_id']

    # Map the Mongo sync state to the Elasticsearch bulk op: documents flagged
    # for removal become delete actions, everything else is indexed.
    if obj[ES_STATE] in ('remove', 'delete'):
      action['_op_type'] = 'delete'
    del obj[ES_STATE]

    action['_source'] = obj

    return action

  def _insert_batch(self, batch):
    mongo_batch = []

    for ok, result in helpers.parallel_bulk(self._es, batch):
      action, result = result.popitem()
      oid = ObjectId(result['_id'])

      if ok:
        mongo_update = UpdateOne(
          {'_id': oid},
          {'$set': {ES_STATE: 'complete'}}
        )
        mongo_batch.append(mongo_update)
      else:
        mongo_update = UpdateOne(
          {'_id': oid},
          {'$set': {ES_STATE: 'error'}}
        )
        mongo_batch.append(mongo_update)
        print('Failed to %s: %s' % (action, result['_id']))

    self._col.bulk_write(mongo_batch)

  def run(self):
    batch = []
    query = {
      '$or': [
        {ES_STATE: 'insert'},
        {ES_STATE: 'update'},
        {ES_STATE: 'remove'}
      ]
    }

    with tqdm(total=self._col.count_documents(query)) as pbar:
      for obj in self._col.find(query):
        batch.append(self._transform(obj))

        if len(batch) == BATCH_SIZE:
          self._insert_batch(batch)
          batch = []
          pbar.update(BATCH_SIZE)

      # Flush the final partial batch while the progress bar is still open
      if batch:
        self._insert_batch(batch)
        pbar.update(len(batch))
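
A minimal entry point for running the sync, assuming the module-level constants used above (ES_HOSTS, MONGO_HOST, and so on) are defined:

if __name__ == '__main__':
  MongoToElasticsearch().run()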