Example #1
def populate_valid_filters():
    global valid_filters
    # Map each filter group to the set of values that are valid to filter on
    # (stored as value -> display label)
    valid_filters = {
        "topic":
        OrderedDict([
            (row.topic, row.topic)
            for row in session.execute("SELECT topic FROM topics ORDER BY 1")
        ]),
        "subtopic":
        OrderedDict([(row.subtopic, row.subtopic) for row in session.execute(
            "SELECT subtopic FROM subtopics ORDER BY 1")]),
        "wave":
        OrderedDict([(row.name, row.name) for row in session.execute(
            "SELECT name FROM wave ORDER BY order_id")]),
        "respondent":
        OrderedDict([(
            row.respondent, row.respondent
        ) for row in session.execute(
            "SELECT DISTINCT(respondent) FROM variable3 WHERE respondent IS NOT NULL ORDER BY 1"
        )]),
        "data_source":
        OrderedDict([
            (row.data_source, row.data_source) for row in session.execute(
                "SELECT DISTINCT(data_source) FROM variable3 WHERE data_source IS NOT NULL ORDER BY 1"
            )
        ]),
        "data_type":
        OrderedDict([(row.data_type, row.data_type) for row in session.execute(
            "SELECT DISTINCT(data_type) FROM variable3 WHERE data_type IS NOT NULL ORDER BY 1"
        )]),
        "scale":
        OrderedDict([(row.scale, row.scale) for row in session.execute(
            "SELECT DISTINCT(scale) FROM variable3 WHERE scale IS NOT NULL ORDER BY 1"
        )]),
        "survey":
        OrderedDict([(row.survey, row.survey) for row in session.execute(
            "SELECT DISTINCT(survey) FROM variable3 WHERE survey IS NOT NULL ORDER BY 1"
        )]),
        "n_cities_asked":
        OrderedDict([
            (row.n_cities_asked, row.n_cities_asked)
            for row in session.execute(
                "SELECT DISTINCT(n_cities_asked) FROM variable3 WHERE n_cities_asked IS NOT NULL ORDER BY 1"
            )
        ]),
        "focal_person":
        OrderedDict([("fp_fchild", "Focal Child"), ("fp_mother", "Mother"),
                     ("fp_father", "Father"), ("fp_PCG", "Primary Caregiver"),
                     ("fp_partner", "Partner"), ("fp_other", "Other"),
                     ("fp_none", "None")])
    }
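
The populated valid_filters mapping keys each filter group to an OrderedDict of value -> display label, already in the order the queries return. A minimal consumption sketch (filter_options is a hypothetical helper; it assumes the module-level session is bound to the metadata database, as in the function above):

def filter_options(group):
    '''Return (value, label) pairs for one filter group, in display order.'''
    populate_valid_filters()  # refresh the module-level mapping from the database
    return list(valid_filters.get(group, {}).items())

# filter_options('focal_person')[:2] == [('fp_fchild', 'Focal Child'), ('fp_mother', 'Mother')]
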
 def testSearchNested(self):
     """
     Test searching a variable given nested search criteria
     """
     with app.app_context():
         results = search([{
             'or': [{
                 'name': 'wave',
                 'op': 'lte',
                 'val': 3
             }, {
                 'name': 'name',
                 'op': 'like',
                 'val': '%z%'
             }]
         }, {
             'name': 'data_source',
             'op': 'eq',
             'val': 'constructed'
         }],
                          as_json=False)
         expected_n_results = next(
             session.execute(
                 'SELECT COUNT(*) FROM variable3 WHERE (wave<=3 OR name LIKE "%z%") AND data_source="constructed"'
             ))[0]
         self.assertEqual(len(results), expected_n_results)
 def testSearchIsNull(self):
     """
     Test searching a variable using the unary is_null operator on one of its attributes (qtext)
     """
     with app.app_context():
         results = search({'name': 'qtext', 'op': 'is_null'}, as_json=False)
         expected_n_results = next(
             session.execute(
                 'SELECT COUNT(*) FROM variable3 WHERE qtext IS NULL'))[0]
         self.assertEqual(len(results), expected_n_results)
Example #4
def populate_raw(csv_path, quiet=False):
    '''Load metadata from a csv file into the "raw" table'''

    if not quiet and input(
            'This operation will delete all data from the "raw" table and re-import it. ARE YOU SURE you want to proceed (yes/no)? '
    ) != 'yes':
        return

    session.execute("DELETE FROM `raw2`")
    raw_table = Table("raw2", Base.metadata, autoload=True)
    with open(csv_path, encoding='utf-8') as f:
        reader = csv.DictReader(f)
        print('-----------\nPopulating raw table\n-----------')
        for i, _d in enumerate(reader, start=1):
            # Remove keys with empty values
            d = dict((k, _d[k]) for k in _d if _d[k] != '')
            session.execute(raw_table.insert(), [d])
            if not i % 500:
                print('Added {} rows.'.format(i))
                session.commit()
        session.commit()
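
populate_raw wipes the raw2 table and re-imports the CSV row by row, committing every 500 inserts; pass quiet=True to skip the interactive confirmation. A minimal invocation sketch (the file name is a placeholder, and it assumes session and Base are configured as in the function above):

if __name__ == '__main__':
    # Re-import the raw metadata without the interactive "yes/no" prompt
    populate_raw('metadata.csv', quiet=True)
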
 def testSearchGt(self):
     """
     Test searching a variable given a comparison on one of its attributes (wave)
     """
     with app.app_context():
         results = search({
             'name': 'wave',
             'op': 'gt',
             'val': 3
         },
                          as_json=False)
         expected_n_results = next(
             session.execute(
                 'SELECT COUNT(*) FROM variable3 WHERE wave>3'))[0]
         self.assertEqual(len(results), expected_n_results)
 def testSearchMultiple(self):
     """
     Test searching a variable given multiple search criteria (implicitly combined by AND)
     """
     with app.app_context():
         results = search([{
             'name': 'wave',
             'op': 'gt',
             'val': 3
         }, {
             'name': 'name',
             'op': 'like',
             'val': '%z%'
         }],
                          as_json=False)
         expected_n_results = next(
             session.execute(
                 'SELECT COUNT(*) FROM variable3 WHERE wave>3 AND name LIKE "%z%"'
             ))[0]
         self.assertEqual(len(results), expected_n_results)
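
The tests above exercise the filter grammar accepted by search(): each criterion is a dict with 'name', 'op' and (except for unary operators such as is_null) 'val'; a list of criteria is AND-ed together; and an {'or': [...]} entry nests alternatives. A sketch of building such a filter programmatically (build_constructed_filter is a hypothetical helper, not part of the test suite):

def build_constructed_filter(max_wave, name_pattern):
    '''Filter for (wave <= max_wave OR name LIKE name_pattern) AND data_source = "constructed".'''
    return [
        {'or': [
            {'name': 'wave', 'op': 'lte', 'val': max_wave},
            {'name': 'name', 'op': 'like', 'val': name_pattern},
        ]},
        {'name': 'data_source', 'op': 'eq', 'val': 'constructed'},
    ]

# results = search(build_constructed_filter(3, '%z%'), as_json=False)
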
def load_csv():
    '''Load metadata to the specified database.'''

    session.execute('DELETE FROM `group`')
    session.execute('DELETE FROM `umbrella`')
    session.execute('DELETE FROM `response`')
    session.execute('DELETE FROM `topic`')
    session.execute('DELETE FROM `variable`')

    with open(os.path.join(os.path.dirname(ffmeta.__file__), current_app.config["METADATA_FILE"])) as t:
        rows = list(DictReader(t))
        vars_loaded = 0
        commit_increment = 1000
        group_ids = []
        umbrella_topics = set()
        for row in rows:
            # Determine group membership
            group_no = None
            group_sub = None
            # Split the raw group id into a numeric part and an optional
            # alphabetic suffix, e.g. "12a" -> group_no "12", group_sub "a"
            groupclass = re.search("[A-Za-z]+", str(row["group"]))
            if not groupclass:
                group_no = str(row["group"])
            else:
                group_sub = groupclass.group(0)
                group_no = str(row["group"]).replace(group_sub, "")

            # Write variable data
            var = Variable(name=row["new_name"],
                           label=row["varlab"].replace('"', "'"),
                           old_name=row["old_name"],
                           data_type=row["type"],
                           warning=int(row["warning"]),
                           group_id=group_no,
                           group_subid=group_sub,
                           data_source=row["source"],
                           respondent=row["respondent"],
                           wave=str(row["wave"]),
                           scope=str(row["scope"]),
                           section=row.get("section"),
                           leaf=str(row["leaf"]))
            session.add(var)

            # Write topic data
            # Also, save umbrella data (we add this table later)
            topic1 = Topic(name=row["new_name"], topic=row["topic1"])
            session.add(topic1)
            umbrella_topics.add((row["topic1"], row["umbrella1"]))
            if len(row["topic2"]) > 0:
                # Some rows have multiple topics (up to 2)
                topic2 = Topic(name=row["new_name"], topic=row["topic2"])
                session.add(topic2)
                umbrella_topics.add((row["topic2"], row["umbrella2"]))

            # Write response data
            for key in row.keys():
                if key.find("label") > -1 and len(row[key]) > 0:
                    # Clean up response label
                    respidx = key.replace("label", "")
                    try:
                        lab_pts = row[key].split(" ", 1)
                        lab_pref = lab_pts[0]
                        val = row["value" + respidx]
                        if lab_pref == val:
                            lab = lab_pts[1]  # Drop the prefix if it's the response value
                        else:
                            lab = row[key]
                    except IndexError:
                        lab = row[key]  # Default to the full entry if we can't clean up

                    # Append new response row
                    resp = Response(name=row["new_name"], label=lab, value=row["value" + respidx])
                    session.add(resp)

            # Add to group list
            group_ids.append(str(row["group"]))

            # Increment variable counter
            vars_loaded += 1

            # Commit in increments of k
            if vars_loaded % commit_increment == 0:
                session.commit()

        # Commit any remaining rows
        session.commit()

        # Build groups table
        # TODO: The groups quality is bad -- revisit this tomorrow
        groups = Counter(group_ids)
        for group_id, group_n in groups.items():
            grp = Group(group_id=group_id, count=group_n)
            session.add(grp)
        session.commit()

        # Build umbrellas table
        for topic, umbrella in umbrella_topics:
            umb = Umbrella(topic=topic, umbrella=umbrella)
            session.add(umb)
        session.commit()

    # Return result
    return "Loaded {} rows to database.".format(str(vars_loaded))
Example #8
def populate_tables(quiet=False):
    '''Load metadata from the `raw` table to other tables.'''

    if not quiet and input(
            'This operation will import data from the "raw" table and will WIPE OUT data from all other tables. ARE YOU SURE you want to proceed (yes/no)? '
    ) != 'yes':
        return

    session.execute('DELETE FROM `response2`')
    session.execute('DELETE FROM `variable3`')
    session.execute('DELETE FROM `topics`')
    session.execute('DELETE FROM `wave`')

    session.commit()

    distinct_topics = set()
    distinct_subtopics = set()
    distinct_waves = set()

    print('-----------\nPopulating variables\n-----------')
    for i, row in enumerate(session.execute('SELECT * FROM `raw2`'), start=1):
        name = row['new_name']
        if row['varlab'] is not None:
            # Replace double quotes with single quotes (carried over from the old import function)
            label = row['varlab'].replace('"', "'")
        else:
            label = row['varlab']
        old_name = row['old_name']
        data_type = row['type']
        warning = row['warning']
        group_id = row['group']
        data_source = row['source']
        respondent = row['respondent']
        wave = row['wave']
        distinct_waves.add(wave)
        n_cities_asked = row['n_cities_asked']
        section = row['section']
        leaf = row['leaf']

        scale = row['scale']
        probe = row['probe']
        qtext = row['qtext']
        survey = row['survey']

        fp_fchild = row['fp_fchild']
        fp_mother = row['fp_mother']
        fp_father = row['fp_father']
        fp_PCG = row['fp_PCG']
        fp_partner = row['fp_partner']
        fp_other = row['fp_other']

        focal_person_dict = {
            'fp_fchild': 'Focal Child',
            'fp_mother': 'Mother',
            'fp_father': 'Father',
            'fp_PCG': 'Primary Caregiver',
            'fp_partner': 'Partner',
            'fp_other': 'Other'
        }
        # The fp_* flags come straight from the row, so look them up there
        # instead of going through locals()
        focal_person = ', '.join(label
                                 for key, label in focal_person_dict.items()
                                 if row[key])

        topics = row['topics']
        if topics is not None:
            for t in topics.split(';'):
                t = t.strip()
                if t:
                    distinct_topics.add(t)

        subtopics = row['subtopics']
        if subtopics is not None:
            for s in subtopics.split(';'):
                s = s.strip()
                if s:
                    distinct_subtopics.add(s)

        in_FFC_file = row['in_FFC_file']
        obs = row['obs']
        min_ = row['min']
        max_ = row['max']
        avg = row['avg']
        std = row['std']

        variable = Variable(name=name,
                            label=label,
                            old_name=old_name,
                            data_type=data_type,
                            warning=warning,
                            group_id=group_id,
                            data_source=data_source,
                            respondent=respondent,
                            n_cities_asked=n_cities_asked,
                            section=section,
                            leaf=leaf,
                            scale=scale,
                            probe=probe,
                            qtext=qtext,
                            fp_fchild=fp_fchild,
                            fp_mother=fp_mother,
                            fp_father=fp_father,
                            fp_PCG=fp_PCG,
                            fp_partner=fp_partner,
                            fp_other=fp_other,
                            focal_person=focal_person,
                            survey=survey,
                            wave=wave,
                            topics=topics,
                            subtopics=subtopics,
                            in_FFC_file=in_FFC_file,
                            obs=obs,
                            min=min_,
                            max=max_,
                            avg=avg,
                            std=std)

        session.add(variable)

        # Write response data
        for key in row.keys():
            if key.find("label") > -1 and row[key] is not None and len(
                    row[key]) > 0:
                # Clean up response label
                respidx = key.replace("label", "")
                try:
                    lab_pts = row[key].split(" ", 1)
                    lab_pref = lab_pts[0]
                    val = row["value" + respidx]
                    if lab_pref == val:
                        # Drop the prefix if it's the response value,
                        # e.g. a label "1 Agree" whose value is "1" becomes "Agree"
                        lab = lab_pts[1]
                    else:
                        lab = row[key]
                except IndexError:
                    # Default to the full entry if we can't clean up
                    lab = row[key]

                # Append new response row
                resp = Response(name=row["new_name"],
                                label=lab,
                                value=row["value" + respidx],
                                freq=row["freq" + respidx],
                                per=row["per" + respidx])
                session.add(resp)

        if not i % 500:
            print("Added {} rows.".format(i))
            session.commit()

    # Use bound parameters so topic/subtopic values containing quotes cannot break the statement
    for topic in distinct_topics:
        session.execute('INSERT INTO topics (topic) VALUES (:topic)',
                        {'topic': topic})

    for subtopic in distinct_subtopics:
        session.execute('INSERT INTO subtopics (subtopic) VALUES (:subtopic)',
                        {'subtopic': subtopic})

    for wave in distinct_waves:
        if wave is not None:
            # Wave names are expected to look like "Year N"; lstrip removes the
            # characters 'Y', 'e', 'a', 'r' and ' ' from the left, leaving "N"
            order_id = wave.lstrip('Year ')
            try:
                order_id = int(order_id)
            except ValueError:
                order_id = 0
            session.execute(
                'INSERT INTO wave (name, order_id) VALUES (:name, :order_id)',
                {'name': wave, 'order_id': order_id})

    session.commit()
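
Taken together, the import functions above suggest a full re-import pipeline: CSV into raw2, raw2 into variable3/response2/topics/subtopics/wave, then a refresh of the filter map. A sketch of that sequence (the ordering is inferred from which tables each function reads and writes, not from a documented command):

def reimport(csv_path):
    '''Re-import everything: CSV -> raw2 -> metadata tables -> valid_filters.'''
    populate_raw(csv_path, quiet=True)
    populate_tables(quiet=True)
    populate_valid_filters()
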