Beispiel #1
0
 def entry_generator(self):
     """Yield vote-to-amendment edges matched through the submitter name.

     For every entry of the parsed ZAKON collection, the document with the
     same id is fetched from the parsed HLASOVANIETLAC collection. Vote
     titles containing "druhé čítanie" are kept (only the text after the
     last "Hlasovanie"), and every amendment under ``ZAKON_ZMENY`` is
     matched against those titles by its submitter name.

     Yields:
         dict: ``NEO4J_BEGINNING_ID`` holds the vote id and
         ``NEO4J_ENDING_ID`` the amendment id, both passed through ``int``.
     """
     laws = utils.get_collection(
         const.CONF_MONGO_ZAKON, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     prints = utils.get_collection(
         const.CONF_MONGO_HLASOVANIETLAC, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for law in laws.iterate_all():
         print_doc = prints.get({const.MONGO_ID: law[const.MONGO_ID]})
         votes = print_doc.get(const.HLASOVANIETLAC_LIST, {})
         amendments = law.get(const.ZAKON_ZMENY, {})
         amendment_ids = sorted(amendments.keys())
         # Only the part of the submitter field before the first comma is used.
         submitters = [
             amendments[key][const.ZAKON_ZMENY_PREDKLADATEL].split(",")[0]
             for key in amendment_ids
         ]
         titles = pd.Series({
             vote_id: vote[const.HLASOVANIE_NAZOV].split("Hlasovanie")[-1]
             for vote_id, vote in votes.items()
             if "druhé čítanie" in vote[const.HLASOVANIE_NAZOV]
         })
         if titles.empty:
             continue
         # A submitter that occurs more than once gets a 1-based running
         # number per occurrence; unique submitters keep 0 (no number filter).
         ordinals = [
             submitters[:pos + 1].count(who) if submitters.count(who) > 1 else 0
             for pos, who in enumerate(submitters)
         ]
         for amendment_id, who, ordinal in zip(amendment_ids, submitters, ordinals):
             # The name without its last character is the search pattern
             # (str.contains treats it as a regular expression by default).
             matched = titles[titles.str.contains(who[:-1])]
             if ordinal > 0:
                 matched = matched[matched.str.contains("{}. návrh".format(ordinal))]
             for vote_id, title in matched.items():
                 if "dopracovanie" in title or "preložiť" in title:
                     continue
                 yield {
                     const.NEO4J_BEGINNING_ID: int(vote_id),
                     const.NEO4J_ENDING_ID: int(amendment_id)
                 }
Beispiel #2
0
    def create_optimizer(self, optimizer=None):
        """Create the metric tensors and the training op for the model.

        Reads the tensors stored in the 'inputs' and 'model' collections,
        registers loss, penalized loss and accuracy under 'metrics' and the
        minimization op under 'training'.

        Args:
            optimizer: Optional callable that receives the learning-rate
                tensor and returns an object exposing ``minimize``. When
                omitted, ``tf.train.GradientDescentOptimizer`` is used.
        """
        model = get_collection('model')
        inputs = get_collection('inputs')

        alpha, l1_ratio = self.alpha, self.l1_ratio
        y = inputs['y']
        class_weights = inputs['class_weights']
        learning_rate = inputs['learning_rate']
        theta = model['theta']
        logits = model['logits']
        predictions = model['predictions']

        with tf.name_scope('metrics'):
            xe = cross_entropy(self.n_classes, logits=logits, labels=y)
            loss = tf.reduce_mean(xe, name='loss')
            # Per-sample weight: sum over axis 1 of class_weights * y.
            weights = tf.reduce_sum(class_weights * y, axis=1)
            weighted_loss = tf.reduce_mean(xe * weights, name='weighted_loss')
            penalty = elastic_net(theta, l1_ratio=l1_ratio)
            penalized_loss = tf.add(weighted_loss,
                                    alpha * penalty,
                                    name='penalized_loss')
            targets = tf.argmax(y, axis=1, name='targets')
            match = tf.cast(tf.equal(predictions, targets), tf.float32)
            accuracy = tf.reduce_mean(match, name='accuracy')
        add_to_collection('metrics', loss, penalized_loss, accuracy)

        with tf.name_scope('training'):
            # Honour the caller-supplied optimizer instead of ignoring it.
            if optimizer is None:
                optimizer = tf.train.GradientDescentOptimizer
            opt = optimizer(learning_rate)
            training_op = opt.minimize(penalized_loss)
        add_to_collection('training', training_op)
Beispiel #3
0
    def create_optimizer(self, optimizer=None):
        """Create accuracy/loss metrics and the training op.

        Registers ``loss`` and ``accuracy`` under 'metrics' and the
        minimization op under 'training'.

        Args:
            optimizer: Optional callable that receives the learning-rate
                tensor and returns an object exposing ``minimize``;
                ``tf.train.GradientDescentOptimizer`` when omitted.
        """
        model = get_collection('model')
        inputs = get_collection('inputs')

        # 'x' and 'probabilities' are looked up but not used below.
        x = inputs['x']
        y = inputs['y']
        logits = model['logits']
        probabilities = model['probabilities']
        predictions = model['predictions']

        with tf.name_scope('metrics'):
            xe = cross_entropy(self.n_classes, logits=logits, labels=y)
            loss = tf.reduce_mean(xe, name='loss')
            targets = tf.argmax(y, axis=1, name='targets')
            hits = tf.equal(predictions, targets)
            accuracy = tf.reduce_mean(tf.cast(hits, tf.float32), name='accuracy')
        add_to_collection('metrics', loss, accuracy)

        with tf.name_scope('training'):
            factory = tf.train.GradientDescentOptimizer if optimizer is None else optimizer
            opt = factory(inputs['learning_rate'])
            # Ops collected under UPDATE_OPS must run before each training step.
            pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(pending_updates):
                training_op = opt.minimize(loss)
        add_to_collection('training', training_op)
Beispiel #4
0
 def __init__(self, db, conf, source_collection=None, target_collection=None):
     """Store the database handle, config and working collections.

     Args:
         db: Database handle, stored as ``self.db`` and forwarded to
             ``utils.get_collection``.
         conf: Configuration object, stored as ``self.conf``.
         source_collection: Collection to read from; looked up with
             ``const.CONF_MONGO_RAW`` when ``None``.
         target_collection: Collection to write to; looked up with
             ``const.CONF_MONGO_PARSED`` when ``None``.
     """
     self.db = db
     self.conf = conf
     # Resolve both defaults before assigning either attribute.
     source = (
         utils.get_collection(self, self.conf, const.CONF_MONGO_RAW, self.db)
         if source_collection is None else source_collection
     )
     target = (
         utils.get_collection(self, self.conf, const.CONF_MONGO_PARSED, self.db)
         if target_collection is None else target_collection
     )
     self.source_collection = source
     self.target_collection = target
     # The logger is named after the text between the first pair of single
     # quotes in str(class), e.g. "<class 'pkg.Name'>" gives "pkg.Name".
     class_repr = str(self.__class__)
     self.log = logging.getLogger(class_repr.split("'")[1])
     self.unique_ids = [const.MONGO_ID]
Beispiel #5
0
    def score(self, X, y):
        """Evaluate the model on a dataset and return its metrics.

        Runs everything stored in the 'metrics' collection of ``self.graph``
        with a feed built from ``X`` and ``y`` in non-training mode, and
        returns the result of that session run.
        """
        self._check_if_session_exists()

        with self.graph.as_default():
            input_tensors = get_collection('inputs')
            metric_tensors = get_collection('metrics')

        feed = self.generate_feed(
            tensors=input_tensors, x=X, y=y, training=False)
        return self._session.run(metric_tensors, feed)
Beispiel #6
0
    def predict_proba(self, X):
        """Return the class probabilities predicted for ``X``.

        Evaluates the 'probabilities' tensor of the 'model' collection with
        a feed built from ``X`` in non-training mode.
        """
        self._check_if_session_exists()

        with self.graph.as_default():
            input_tensors = get_collection('inputs')
            model_tensors = get_collection('model')
            probability_tensor = model_tensors['probabilities']

        feed = self.generate_feed(tensors=input_tensors, x=X, training=False)
        return self._session.run(probability_tensor, feed)
def worker():
    """Refresh the 'newsarticles' collection with freshly curated articles.

    Articles of every configured paper are processed and curated. When the
    curation yields anything, the collection is emptied and refilled;
    otherwise it is left untouched.
    """
    papers = utils.get_newspapers(get_source_urls())

    processed_articles = []
    for paper in papers:
        processed_articles.extend(utils.process_articles_for_paper(paper))

    curator = ArticleCurator(processed_articles)
    curator.curate()

    # Nothing new: keep whatever is currently stored.
    if not curator.curated_articles:
        return

    # Empty the collection first, then store the new curated articles.
    articles_collection = utils.get_collection('newsarticles')
    articles_collection.remove()
    for article in curator.curated_articles:
        record = {
            'title': article.title,
            'summary': article.summary,
            'image_url': article.top_image,
            'url': article.url,
            'keywords': article.keywords
        }
        articles_collection.insert(record)
Beispiel #8
0
 def entry_generator(self):
     """Yield each parsed HLASOVANIE entry without two of its fields.

     The ``MONGO_TIMESTAMP`` and ``HLASOVANIE_INDIVIDUALNE`` keys are
     deleted from every entry; a missing key raises ``KeyError``.
     """
     votes = utils.get_collection(
         const.CONF_MONGO_HLASOVANIE, self.conf, const.CONF_MONGO_PARSED, self.db)
     for record in votes.iterate_all():
         for dropped in (const.MONGO_TIMESTAMP, const.HLASOVANIE_INDIVIDUALNE):
             del record[dropped]
         yield record
Beispiel #9
0
def get_raw_gt_data():
    """Load 'gatherTownUsersCheck' events and unpack two property fields.

    Fetches the 'lwevents' collection filtered by event name, then adds
    'time' and 'gatherTownUsers' columns taken from the per-row
    'properties' value via the ``.str[...]`` accessor.
    """
    events = get_collection(
        'lwevents',
        query_filter={"name": "gatherTownUsersCheck"},
        db=get_mongo_db_object(),
    )
    for field in ('time', 'gatherTownUsers'):
        events[field] = events['properties'].str[field]
    return events
Beispiel #10
0
def test_std_doc():
    """Check saving, field mutability rules and persistence of ``StdDoc``.

    A document gets an ``_id`` only after ``save()``. Assigning to
    ``field3``, or assigning a non-string to ``field1``, is expected to
    raise. A valid change to ``field1`` must be stored in the collection.
    """
    doc = StdDoc(field1="lolz", field2="", field3=12)
    assert not hasattr(doc, "_id")
    doc.save()
    assert hasattr(doc, "_id")
    assert find_valid_parent_docs(type(doc)) == []
    # not mutable field
    mutated = False
    try:
        doc["field3"] = 1
        mutated = True
    except Exception:
        # Any ordinary error counts as a rejection; KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        pass
    assert mutated is False
    # mutable field but wrong type
    mutated = False
    try:
        doc["field1"] = 1
        mutated = True
    except Exception:
        pass
    assert mutated is False
    # mutable and should have no errors
    doc["field1"] = "looo"
    doc.save()
    assert doc["field1"] == "looo"
    # The change must also be visible when reading the raw collection.
    collec = get_collection(doc)
    elts = list(collec.find({"_id": doc._id}))
    assert len(elts) == 1
    assert elts[0]["field1"] == "looo"
Beispiel #11
0
def update_doubt():
    """Update the doubt, answer and topic of an existing 'doubts' document.

    The document id comes from the ``id`` query argument and the new values
    from the JSON body.

    Returns:
        tuple: ``('Accepted', 202)`` on success, ``('Doubt not found', 404)``
        when no document has that id, and ``('Bad request', 400)`` when the
        body is not a JSON object, ``doubt``/``answer`` are not strings, or
        the id is malformed.
    """
    # Imported locally so the block does not rely on a file-level import.
    from bson.errors import InvalidId

    doubt_id = request.args.get('id')

    # A missing or non-object body is a client error, not a server crash.
    payload = request.json
    if not isinstance(payload, dict):
        return 'Bad request', 400

    doubt = payload.get('doubt')
    answer = payload.get('answer')
    topic = payload.get('topic')

    if not (isinstance(doubt, str) and isinstance(answer, str)):
        return 'Bad request', 400

    try:
        # NOTE: ObjectId(None) generates a fresh id, so a missing ``id``
        # argument still ends in the 404 branch below, as before.
        query = {'_id': ObjectId(doubt_id)}
    except (InvalidId, TypeError):
        return 'Bad request', 400

    documents = get_collection('doubts')
    if not documents.find_one(query):
        return 'Doubt not found', 404

    new_values = {
        '$set': {
            'doubt': doubt,
            'answer': answer,
            'topic': topic
        }
    }
    documents.update_one(query, new_values, upsert=True)
    return 'Accepted', 202
Beispiel #12
0
 def entry_generator(self):
     """Yield committee-to-law edges derived from the committee decision text.

     Committee ids are read from the "nodes_vybor" collection. For every
     parsed ZAKON entry that has ``ZAKON_ROZHODNUTIE_VYBORY``, an edge is
     yielded for each committee id found in that text or in
     ``ZAKON_ROZHODNUTIE_GESTORSKY``. A match in the latter takes
     precedence and sets the type to ``NAVRHNUTY_GESTORSKY``; otherwise the
     type is ``NAVRHNUTY_DOPLNUJUCI``.

     Yields:
         dict: begin id (committee), end id (law), ``NAVRHNUTY_LEHOTA``
         from ``self.get_lehota`` and ``NAVRHNUTY_TYP``.
     """
     vybory = [
         entry[const.MONGO_ID]
         for entry in storage.MongoCollection(self.db, "nodes_vybor").iterate_all()
     ]
     source_collection = utils.get_collection(
         const.CONF_MONGO_ZAKON, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for entry in source_collection.iterate_all():
         if const.ZAKON_ROZHODNUTIE_VYBORY not in entry:
             continue
         sprava = entry[const.ZAKON_ROZHODNUTIE_VYBORY]
         if sprava == "":
             # Skip only this entry: leaving the loop here would silently
             # drop every remaining law in the collection.
             continue
         lehota = self.get_lehota(sprava)
         for vybor in vybory:
             if vybor in entry[const.ZAKON_ROZHODNUTIE_GESTORSKY]:
                 typ = const.NAVRHNUTY_GESTORSKY
             elif vybor in sprava:
                 typ = const.NAVRHNUTY_DOPLNUJUCI
             else:
                 continue
             yield {
                 const.NEO4J_BEGINNING_ID: vybor,
                 const.NEO4J_ENDING_ID: entry[const.MONGO_ID],
                 const.NAVRHNUTY_LEHOTA: lehota,
                 const.NAVRHNUTY_TYP: typ
             }
Beispiel #13
0
 def entry_generator(self):
     """Yield each parsed ZMENA entry with three fields removed.

     ``ZMENA_PODPISANI`` and ``ZMENA_DALSI`` are dropped when present;
     ``ZMENA_PREDKLADATEL`` is popped without a default, so an entry
     lacking it raises ``KeyError``.
     """
     changes = utils.get_collection(
         const.CONF_MONGO_ZMENA, self.conf, const.CONF_MONGO_PARSED, self.db)
     for record in changes.iterate_all():
         for optional_key in (const.ZMENA_PODPISANI, const.ZMENA_DALSI):
             record.pop(optional_key, None)
         record.pop(const.ZMENA_PREDKLADATEL)
         yield record
Beispiel #14
0
 def restore_ensemble_model(self):
     """Load the ensemble checkpoint into its own graph and session.

     Sets ``self.ensemble_graph``, ``self.ensemble_session`` and
     ``self.ensemble_model``, a dict mapping "probs", "logits",
     "temperature_ph" and "inputs" to what ``U.get_collection`` returns
     for those names.
     """
     self.ensemble_graph = tf.Graph()
     self.ensemble_session = U.get_session(self.ensemble_graph)
     with self.ensemble_graph.as_default():
         saver = tf.train.import_meta_graph("checkpoint/ensemble.meta")
         saver.restore(self.ensemble_session, "checkpoint/ensemble")
     names = ["probs", "logits", "temperature_ph", "inputs"]
     restored = U.get_collection(names, self.ensemble_graph)
     self.ensemble_model = {
         name: tensor for name, tensor in zip(names, restored)
     }
Beispiel #15
0
 def entry_generator(self):
     """Yield one edge per key listed under ``ZAKON_ZMENY`` of each entry.

     Yields:
         dict: begin id is the key converted with ``int``; end id is the
         id of the entry that lists it.
     """
     laws = utils.get_collection(
         const.CONF_MONGO_ZAKON, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for law in laws.iterate_all():
         # Entries without the key contribute no edges.
         change_ids = law.get(const.ZAKON_ZMENY, {})
         for change_id in change_ids:
             edge = {
                 const.NEO4J_BEGINNING_ID: int(change_id),
                 const.NEO4J_ENDING_ID: law[const.MONGO_ID]
             }
             yield edge
Beispiel #16
0
 def entry_generator(self):
     """Yield one edge per key listed under ``HLASOVANIETLAC_LIST``.

     Yields:
         dict: begin id is the key converted with ``int``; end id is the
         id of the entry that lists it.
     """
     prints = utils.get_collection(
         const.CONF_MONGO_HLASOVANIETLAC, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for print_doc in prints.iterate_all():
         # Entries without the key contribute no edges.
         vote_ids = print_doc.get(const.HLASOVANIETLAC_LIST, {})
         for vote_id in vote_ids:
             edge = {
                 const.NEO4J_BEGINNING_ID: int(vote_id),
                 const.NEO4J_ENDING_ID: print_doc[const.MONGO_ID]
             }
             yield edge
Beispiel #17
0
def ingest_derivative():
    """Walk the bags that are missing an ingest and resolve their collection.

    Bags whose collection cannot be determined are reported on stdout.
    The ingest step itself is not implemented yet.
    """
    for bag in list_missing_ingest():
        mmsid = get_mmsid(bag)
        collection = None
        if mmsid:
            collection = get_collection(mmsid)
        if collection is None:
            print("Could not determine collection for: {0}".format(bag))
            continue
        # call islandora remote worker to ingest bag
        # update digital catalog
Beispiel #18
0
 def entry_generator(self):
     """Yield one node per distinct delegation named in ``POSLANEC_CLENSTVO``.

     A membership counts as a delegation when its name contains
     ``const.POSLANEC_DELEGACIA``, compared case-insensitively. The whole
     collection is scanned before the first node is yielded.

     Yields:
         dict: ``{MONGO_ID: <organisation name>}``, in set iteration order.
     """
     members = utils.get_collection(
         const.CONF_MONGO_POSLANEC, self.conf, const.CONF_MONGO_PARSED, self.db)
     delegations = {
         org
         for member in members.iterate_all()
         for org in member[const.POSLANEC_CLENSTVO]
         if const.POSLANEC_DELEGACIA.lower() in org.lower()
     }
     for delegation in delegations:
         yield {const.MONGO_ID: delegation}
Beispiel #19
0
 def entry_generator(self):
     """Yield one edge per name listed under ``ZMENA_PODPISANI``.

     Yields:
         dict: begin id is resolved with ``utils.get_poslanec_id``; end id
         is the id of the entry that lists the name.
     """
     changes = utils.get_collection(
         const.CONF_MONGO_ZMENA, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for change in changes.iterate_all():
         # Entries without the key contribute no edges.
         signers = change.get(const.ZMENA_PODPISANI, [])
         for signer in signers:
             edge = {
                 const.NEO4J_BEGINNING_ID: utils.get_poslanec_id(self.db, signer),
                 const.NEO4J_ENDING_ID: change[const.MONGO_ID]
             }
             yield edge
Beispiel #20
0
 def entry_generator(self):
     """Yield an edge for every entry that carries ``ZAKON_GESTORSKY``.

     Yields:
         dict: begin id is the ``ZAKON_GESTORSKY`` value; end id is the
         entry id.
     """
     laws = utils.get_collection(
         const.CONF_MONGO_ZAKON, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for law in laws.iterate_all():
         if const.ZAKON_GESTORSKY not in law:
             continue
         yield {
             const.NEO4J_BEGINNING_ID: law[const.ZAKON_GESTORSKY],
             const.NEO4J_ENDING_ID: law[const.MONGO_ID]
         }
Beispiel #21
0
 def entry_generator(self):
     """Yield one edge per key listed under ``PREDLOZILZAKON_LIST``.

     Yields:
         dict: begin id is the id of the entry; end id is the listed key
         converted with ``int``.
     """
     initiatives = utils.get_collection(
         const.CONF_MONGO_LEGISLATIVNAINICIATIVA,
         self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for initiative in initiatives.iterate_all():
         # Entries without the key contribute no edges.
         law_ids = initiative.get(const.PREDLOZILZAKON_LIST, {})
         for law_id in law_ids:
             edge = {
                 const.NEO4J_BEGINNING_ID: initiative[const.MONGO_ID],
                 const.NEO4J_ENDING_ID: int(law_id)
             }
             yield edge
Beispiel #22
0
 def entry_generator(self):
     """Yield an edge for every item of ``ROZPRAVA_VYSTUPENIA`` that has
     ``ROZPRAVA_TLAC``.

     Yields:
         dict: begin id is the item's id; end id is its ``ROZPRAVA_TLAC``
         value.
     """
     debates = utils.get_collection(
         const.CONF_MONGO_ROZPRAVA, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for debate in debates.iterate_all():
         for speech in debate[const.ROZPRAVA_VYSTUPENIA]:
             if const.ROZPRAVA_TLAC not in speech:
                 continue
             yield {
                 const.NEO4J_BEGINNING_ID: speech[const.MONGO_ID],
                 const.NEO4J_ENDING_ID: speech[const.ROZPRAVA_TLAC]
             }
Beispiel #23
0
def worker():
    """Store the current system temperature and tweet it.

    The reading is inserted into the 'systemtemperaturedatas' collection.
    When the insert fails, the error is logged and no tweet is sent.
    """
    # Imported locally so the block does not rely on a file-level import.
    import logging

    temperature_data = get_system_temp()
    temperature_data_collection = utils.get_collection('systemtemperaturedatas')
    try:
        temperature_data_collection.insert(temperature_data)
    except Exception:
        # Still best effort, but the failure is no longer invisible, and
        # KeyboardInterrupt/SystemExit are not swallowed any more.
        logging.getLogger(__name__).exception(
            "Could not store system temperature data")
        return

    tweeter = CPUTemperatureTweeter()
    tweeter.tweet_it()
Beispiel #24
0
 def create_model(self):
     """Build a single-layer linear classifier under the 'model' name scope.

     Creates a ``theta`` weight variable of shape
     ``(n_features, n_classes)`` and a scalar ``bias`` variable, then
     registers theta, logits, probabilities and predictions in the 'model'
     collection.
     """
     inputs = get_collection('inputs')
     with tf.name_scope('model'):
         initial_weights = tf.truncated_normal((self.n_features, self.n_classes))
         theta = tf.Variable(initial_weights, name='theta')
         bias = tf.Variable(0.0, name='bias')
         projection = tf.matmul(inputs['x'], theta)
         logits = tf.add(projection, bias, name='logits')
         # Sigmoid for exactly two classes, softmax otherwise.
         if self.n_classes == 2:
             activate = tf.nn.sigmoid
         else:
             activate = tf.nn.softmax
         probabilities = activate(logits, name='probabilities')
         predictions = tf.argmax(probabilities, axis=1, name='predictions')
     add_to_collection('model', theta, logits, probabilities, predictions)
Beispiel #25
0
def delete_doubt():
    """Delete the 'doubts' document named by the ``id`` query argument.

    Returns:
        tuple: ``('Accepted', 202)`` when the document existed and was
        deleted, ``('Doubt not found', 404)`` when no document has that id,
        and ``('Bad request', 400)`` when the id is malformed.
    """
    # Imported locally so the block does not rely on a file-level import.
    from bson.errors import InvalidId

    doubt_id = request.args.get('id')

    try:
        # NOTE: ObjectId(None) generates a fresh id, so a missing ``id``
        # argument still ends in the 404 branch below, as before.
        query = {'_id': ObjectId(doubt_id)}
    except (InvalidId, TypeError):
        return 'Bad request', 400

    documents = get_collection('doubts')
    if not documents.find_one(query):
        return 'Doubt not found', 404

    # ``_id`` is unique, so deleting one document equals the old remove().
    documents.delete_one(query)
    return 'Accepted', 202
Beispiel #26
0
 def create_model(self):
     """Build a stack of ``Dense`` layers under the 'model' name scope.

     One ``Dense`` layer is built per item of ``self.config``, followed by
     a ``tf.layers.dense`` output layer with ``n_classes`` units. Logits,
     probabilities and predictions are registered in the 'model'
     collection.
     """
     inputs = get_collection('inputs')
     with tf.name_scope('model'):
         hidden = inputs['x']
         for layer_config in self.config:
             hidden = Dense(**layer_config).build(
                 hidden, training=inputs['training'])
         logits = tf.layers.dense(hidden, units=self.n_classes, name='logits')
         # Sigmoid for exactly two classes, softmax otherwise.
         if self.n_classes == 2:
             activate = tf.nn.sigmoid
         else:
             activate = tf.nn.softmax
         probabilities = activate(logits, name='probabilities')
         predictions = tf.argmax(probabilities, axis=1, name='predictions')
     add_to_collection('model', logits, probabilities, predictions)
Beispiel #27
0
 def entry_generator(self):
     """Yield one node per club, with its size, from the newest vote.

     Only the entry with the highest ``MONGO_ID`` is fetched. The
     ``HLASOVANIE_KLUB`` values of its individual votes are counted with
     ``np.unique``.

     Yields:
         dict: ``MONGO_ID`` is the club name passed through
         ``utils.parse_klub``; ``KLUB_POCET`` is how often it occurred.
     """
     votes = utils.get_collection(
         const.CONF_MONGO_HLASOVANIE, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     newest = votes.get(
         {},
         projection=[const.HLASOVANIE_INDIVIDUALNE],
         sort=[(const.MONGO_ID, -1)],
     )
     clubs = [
         ballot[const.HLASOVANIE_KLUB]
         for ballot in newest[const.HLASOVANIE_INDIVIDUALNE].values()
     ]
     club_names, club_sizes = np.unique(clubs, return_counts=True)
     for raw_name, size in zip(club_names, club_sizes):
         # int() turns the numpy count into a plain Python integer.
         yield {
             const.MONGO_ID: utils.parse_klub(raw_name),
             const.KLUB_POCET: int(size)
         }
Beispiel #28
0
 def entry_generator(self):
     """Yield an edge for every delegation membership of each entry.

     A membership counts as a delegation when its name contains
     ``const.POSLANEC_DELEGACIA``, compared case-insensitively.

     Yields:
         dict: begin id is the entry id, end id is the organisation name,
         and ``CLEN_TYP`` is the membership type mapped through
         ``const.CLEN_TYP_DICT``.
     """
     members = utils.get_collection(
         const.CONF_MONGO_POSLANEC, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for member in members.iterate_all():
         for org, typ in member[const.POSLANEC_CLENSTVO].items():
             if const.POSLANEC_DELEGACIA.lower() not in org.lower():
                 continue
             yield {
                 const.NEO4J_BEGINNING_ID: member[const.MONGO_ID],
                 const.NEO4J_ENDING_ID: org,
                 const.CLEN_TYP: const.CLEN_TYP_DICT[typ]
             }
Beispiel #29
0
 def entry_generator(self):
     """Yield one edge per individual vote recorded in each entry.

     Yields:
         dict: begin id is the key of the individual vote converted with
         ``int``; end id is the entry id. ``HLASOVAL_HLAS`` is the vote
         value mapped through ``const.HLASOVAL_HLAS_DICT`` and
         ``HLASOVAL_KLUB`` the club passed through ``utils.parse_klub``.
     """
     votes = utils.get_collection(
         const.CONF_MONGO_HLASOVANIE, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for vote in votes.iterate_all():
         ballots = vote[const.HLASOVANIE_INDIVIDUALNE]
         for member_id, ballot in ballots.items():
             yield {
                 const.NEO4J_BEGINNING_ID: int(member_id),
                 const.NEO4J_ENDING_ID: vote[const.MONGO_ID],
                 const.HLASOVAL_HLAS: const.HLASOVAL_HLAS_DICT[ballot[const.HLASOVANIE_HLAS]],
                 const.HLASOVAL_KLUB: utils.parse_klub(ballot[const.HLASOVANIE_KLUB])
             }
Beispiel #30
0
    def setUpClass(cls):
        """Create one ``GeometryModel`` per fixture geometry and index them.

        The fixtures are read from class attributes; afterwards a
        GEOSPHERE index on ``geom`` is ensured on the model's collection.
        """
        coll = get_collection(GeometryModel)

        fixture_names = (
            'point', 'line', 'polygon',
            'multi_point', 'multi_line', 'multi_polygon',
            'geom_collection',
        )
        for fixture_name in fixture_names:
            GeometryModel.objects.create(geom=getattr(cls, fixture_name))

        # not sure why the tests don't create the index...
        coll.ensure_index([('geom', pymongo.GEOSPHERE)])
Beispiel #31
0
def get_doubts():
    """Return every document of the 'doubts' collection as JSON.

    Returns:
        tuple: the JSON response and status 200. Each item carries
        ``_id`` (as a string), ``doubt``, ``answer``, ``topic`` and
        ``user``.
    """
    documents = get_collection('doubts')
    output = [
        {
            '_id': str(document['_id']),
            'doubt': document['doubt'],
            'answer': document['answer'],
            'topic': document['topic'],
            'user': document['user'],
        }
        for document in documents.find({})
    ]
    return jsonify(output), 200
Beispiel #32
0
def get_topics_from_user():
    """Return the distinct topics of one user's doubts as JSON.

    The user comes from the ``user`` query argument. Topics keep the order
    in which they are first seen.

    Returns:
        tuple: the JSON list and status 200, or
        ``('User not found', 404)`` when ``user_exists`` is false.
    """
    user = request.args.get('user')
    if not user_exists(user):
        return 'User not found', 404

    topics = []
    for document in get_collection('doubts').find({'user': user}):
        topic = document['topic']
        if topic not in topics:
            topics.append(topic)

    return jsonify(topics), 200
Beispiel #33
0
 def entry_generator(self):
     """Yield an edge from each entry to every item of its
     ``ROZPRAVA_VYSTUPENIA``.

     Yields:
         dict: begin id is the entry id and end id the item id.
         ``ROZPRAVA_POSLANEC_KLUB`` is looked up in ``const.KLUB_DICT``
         under "Klub " + the item's club, falling back to
         ``const.NEO4J_NULLVALUE``. ``ROZPRAVA_POSLANEC_TYP`` is copied
         from the item.
     """
     debates = utils.get_collection(
         const.CONF_MONGO_ROZPRAVA, self.conf, const.CONF_MONGO_PARSED, self.db
     )
     for debate in debates.iterate_all():
         for speech in debate[const.ROZPRAVA_VYSTUPENIA]:
             club_key = "Klub " + speech[const.ROZPRAVA_POSLANEC_KLUB]
             club = const.KLUB_DICT.get(club_key, const.NEO4J_NULLVALUE)
             yield {
                 const.NEO4J_BEGINNING_ID: debate[const.MONGO_ID],
                 const.NEO4J_ENDING_ID: speech[const.MONGO_ID],
                 const.ROZPRAVA_POSLANEC_KLUB: club,
                 const.ROZPRAVA_POSLANEC_TYP: speech[const.ROZPRAVA_POSLANEC_TYP]
             }
Beispiel #34
0
def run_import(csv_input_file, db_host, db_port, db_name, db_collection):
    """Import the rows of a CSV file into a MongoDB collection.

    The file is read as UTF-8 with undecodable bytes ignored. Every object
    produced by ``create_json`` is inserted individually. Start and finish
    times are printed.
    """
    started_at = datetime.now()
    print('Started CSV import - {0}'.format(started_at))

    target = utils.get_collection(db_host, db_port, db_name, db_collection)

    with open(csv_input_file, encoding='utf-8', errors='ignore') as handle:
        rows = csv.DictReader(handle)
        for document in create_json(rows):
            target.insert_one(document)

    finished_at = datetime.now()
    print('Finished: CSV import - {0}'.format(finished_at))
def worker():
    """Insert the current system storage data into 'systemstoragedatas'."""
    snapshot = get_system_storage()
    utils.get_collection('systemstoragedatas').insert(snapshot)
def worker():
    """Insert the current system config data into 'systemconfigdatas'."""
    snapshot = get_system_config()
    utils.get_collection('systemconfigdatas').insert(snapshot)
Beispiel #37
0
def get_collection_view(request):
    """Wrap the result of ``utils.get_collection(request)`` in a success
    response built by ``public.success_result_http``.
    """
    return public.success_result_http(utils.get_collection(request))
def worker():
    """Insert the current system memory data into 'systemmemorydatas'."""
    snapshot = get_system_memory()
    utils.get_collection('systemmemorydatas').insert(snapshot)
Beispiel #39
0
    def test_map_reduce(self, inline=False):
        """Exercise map/reduce on ``MapReduceModel``, inline or with output.

        The map function emits ``m`` once per unit of ``n`` for each
        document, and the reduce function sums the emitted values. Each
        document therefore reduces to ``n * m``, and a document with
        ``n == 0`` emits nothing.

        Args:
            inline: When true, use ``inline_map_reduce``; otherwise write
                the results to the 'm/r-out' collection and also check it.
        """
        mapfunc = """
            function map() {
                for(i=0; i<this.n; ++i) {
                    emit(this._id, this.m)
                }
            }
        """

        reducefunc = """
            function reduce(key, values) {
                var res = 0
                values.forEach(function(x) { res += x})
                return res
            }
        """

        if inline:
            map_reduce = MapReduceModel.objects.inline_map_reduce
        else:
            map_reduce = partial(MapReduceModel.objects.map_reduce,
                                 out='m/r-out')
        map_reduce = partial(map_reduce, mapfunc, reducefunc)

        random_numbers = [
            (3, 4),
            (6, 19),
            (5, 8),
            (0, 20), # This instance won't be emitted by `map`.
            (2, 77),
            (300, 10),
        ]

        for n, m in random_numbers:
            MapReduceModel(n=n, m=m).save()

        # Test mapfunc + reducefunc.
        documents = map_reduce()
        documents = list(documents)
        self.assertEqual(len(documents), len(random_numbers) - 1)
        self.assertEqual(sum(doc.value for doc in documents),
                         sum(n * m for n, m in random_numbers))

        # Test MapReduceResult.
        obj = documents[0].model.objects.get(id=documents[0].key)
        # assertTrue replaces the removed ``assert_`` alias.
        self.assertTrue(isinstance(obj, MapReduceModel))
        self.assertEqual((obj.n, obj.m), random_numbers[0])
        self.assertTrue(obj.id)

        # Collection should not have been perished.
        if not inline:
            result_collection = get_collection('m/r-out')
            self.assertEqual(result_collection.count(),
                             len(random_numbers) - 1)

            # Test drop_collection.
            # The next() builtin works on Python 2.6+ and Python 3, unlike
            # the Python-2-only ``.next()`` method.
            next(map_reduce(drop_collection=True))
            self.assertEqual(get_collection('m/r-out').count(), 0)

        # Test arbitrary kwargs.
        documents = list(map_reduce(limit=3))
        self.assertEqual(len(documents), 3)
        self.assertEqual(sum(doc.value for doc in documents),
                         sum(n * m for n, m in random_numbers[:3]))

        # Test with .filter(...).
        qs = MapReduceModel.objects.filter(n__lt=300).filter(~Q(m__in=[4]))
        if inline:
            documents = qs.inline_map_reduce(mapfunc, reducefunc)
        else:
            documents = list(qs.map_reduce(mapfunc,
                                           reducefunc, out='m/r-out'))
        self.assertEqual(len(documents), len(random_numbers) - 2 - 1)
        self.assertEqual(sum(doc.value for doc in documents),
                         sum(n * m for n, m in random_numbers[1:-1]))
import utils
import csv

def create_feature_dict(names, values):
  """Map column names to float values, skipping the first two columns.

  Every position from index 2 up to the end of ``values`` is paired with
  the name at the same index. ``names`` must be at least as long as
  ``values``, otherwise ``IndexError`` is raised.
  """
  features = {}
  for position, raw in enumerate(values[2:], 2):
    features[names[position]] = float(raw)
  return features

# Attach the lexical features from the CSV file to the matching documents.
# The first two columns of each row are the ids handed to utils.find_doc;
# the remaining columns become the feature values, keyed by the header row.
collection = utils.get_collection()
# NOTE(review): binary mode with csv.reader is the Python 2 convention;
# Python 3 would need text mode with newline=''. Confirm the target version.
with open('data/danfeatures.mar102011.csv', 'rb') as f:
  reader = csv.reader(f, delimiter=',')
  # The first row holds the column names; an empty file yields no headers.
  headers = next(reader, [])
  for row in reader:
    doc = utils.find_doc(int(row[0]), int(row[1]))
    if doc is None:
      continue
    features = create_feature_dict(headers, row)
    # The first column decides which side of the document the row describes.
    gender = 'male' if int(row[0]) == doc['male_id'] else 'female'
    key = 'lexical_features.' + gender
    # Positional flags are unchanged: upsert=False, multi=True.
    collection.update({'_id':doc['_id']}, {'$set': {key: features}}, False, True)