def populate_valid_filters():
    '''Build the global filter-group -> {value: label} map used to validate
    incoming filter requests.

    Each key of ``valid_filters`` names a filter group; its value is an
    OrderedDict mapping every legal filter value to its display label
    (label == value for database-derived groups, human-readable labels
    for the hard-coded focal_person group).
    '''
    global valid_filters

    def _distinct(column):
        # Distinct non-NULL values of a variable3 column, ascending, keyed
        # and labelled by the value itself.
        query = ("SELECT DISTINCT({0}) FROM variable3 "
                 "WHERE {0} IS NOT NULL ORDER BY 1").format(column)
        return OrderedDict(
            (getattr(row, column), getattr(row, column))
            for row in session.execute(query))

    valid_filters = {
        "topic": OrderedDict(
            (row.topic, row.topic)
            for row in session.execute("SELECT topic FROM topics ORDER BY 1")),
        "subtopic": OrderedDict(
            (row.subtopic, row.subtopic)
            for row in session.execute(
                "SELECT subtopic FROM subtopics ORDER BY 1")),
        # Waves keep their curated display order rather than alphabetical.
        "wave": OrderedDict(
            (row.name, row.name)
            for row in session.execute(
                "SELECT name FROM wave ORDER BY order_id")),
        "respondent": _distinct("respondent"),
        "data_source": _distinct("data_source"),
        "data_type": _distinct("data_type"),
        "scale": _distinct("scale"),
        "survey": _distinct("survey"),
        "n_cities_asked": _distinct("n_cities_asked"),
        # Focal-person choices are flag *columns* on variable3, not values,
        # so their labels are hard-coded instead of read from the database.
        "focal_person": OrderedDict([
            ("fp_fchild", "Focal Child"),
            ("fp_mother", "Mother"),
            ("fp_father", "Father"),
            ("fp_PCG", "Primary Caregiver"),
            ("fp_partner", "Partner"),
            ("fp_other", "Other"),
            ("fp_none", "None"),
        ]),
    }
def testSearchNested(self):
    """Test searching variables with nested criteria (an OR group combined
    with a plain criterion, implicitly ANDed)."""
    with app.app_context():
        criteria = [
            {'or': [{'name': 'wave', 'op': 'lte', 'val': 3},
                    {'name': 'name', 'op': 'like', 'val': '%z%'}]},
            {'name': 'data_source', 'op': 'eq', 'val': 'constructed'},
        ]
        found = search(criteria, as_json=False)
        # Expected count comes straight from an equivalent raw SQL query.
        count_row = next(session.execute(
            'SELECT COUNT(*) FROM variable3 WHERE (wave<=3 OR name LIKE "%z%") AND data_source="constructed"'
        ))
        self.assertEqual(len(found), count_row[0])
def testSearchIsNull(self):
    """Test searching variables by a null check on the qtext attribute."""
    with app.app_context():
        found = search({'name': 'qtext', 'op': 'is_null'}, as_json=False)
        # Expected count comes straight from an equivalent raw SQL query.
        count_row = next(session.execute(
            'SELECT COUNT(*) FROM variable3 WHERE qtext IS NULL'))
        self.assertEqual(len(found), count_row[0])
def populate_raw(csv_path, quiet=False):
    '''Load metadata from a csv file into the "raw" table.

    csv_path -- path to the source CSV file (read as UTF-8).
    quiet    -- when True, skip the interactive confirmation prompt.

    Wipes the `raw2` table, then inserts one row per CSV record, dropping
    empty-string fields so they are stored as NULL. Commits every 500 rows
    and once more at the end.
    '''
    # Fixed typo in the confirmation prompt ("fom" -> "from").
    if not quiet and input(
            'This operation will delete all data from the "raw" table and re-import it. ARE YOU SURE you want to proceed (yes/no)? '
    ) != 'yes':
        return
    session.execute("DELETE FROM `raw2`")
    raw_table = Table("raw2", Base.metadata, autoload=True)
    with open(csv_path, encoding='utf-8') as f:
        reader = csv.DictReader(f)
        print('-----------\nPopulating raw table\n-----------')
        for i, record in enumerate(reader, start=1):
            # Remove keys with empty values so they become NULL in the DB.
            d = {k: v for k, v in record.items() if v != ''}
            session.execute(raw_table.insert(), [d])
            if not i % 500:
                print('Added {} rows.'.format(i))
                session.commit()
        # Commit any rows added since the last 500-row increment.
        session.commit()
def testSearchGt(self):
    """Test searching variables with a greater-than comparison on the
    wave attribute."""
    with app.app_context():
        criterion = {'name': 'wave', 'op': 'gt', 'val': 3}
        found = search(criterion, as_json=False)
        # Expected count comes straight from an equivalent raw SQL query.
        count_row = next(session.execute(
            'SELECT COUNT(*) FROM variable3 WHERE wave>3'))
        self.assertEqual(len(found), count_row[0])
def testSearchMultiple(self):
    """Test searching variables with several criteria, implicitly combined
    by AND."""
    with app.app_context():
        criteria = [
            {'name': 'wave', 'op': 'gt', 'val': 3},
            {'name': 'name', 'op': 'like', 'val': '%z%'},
        ]
        found = search(criteria, as_json=False)
        # Expected count comes straight from an equivalent raw SQL query.
        count_row = next(session.execute(
            'SELECT COUNT(*) FROM variable3 WHERE wave>3 AND name LIKE "%z%"'
        ))
        self.assertEqual(len(found), count_row[0])
def load_csv():
    '''Load metadata to the specified database.

    Reads the CSV named by METADATA_FILE (relative to the ffmeta package),
    wipes the group/umbrella/response/topic/variable tables, and repopulates
    them row by row, committing every 1000 variables.

    Returns a summary string with the number of variables loaded.
    '''
    # Wipe every table we are about to repopulate.
    session.execute('DELETE FROM `group`')
    session.execute('DELETE FROM `umbrella`')
    session.execute('DELETE FROM `response`')
    session.execute('DELETE FROM `topic`')
    session.execute('DELETE FROM `variable`')
    with open(os.path.join(os.path.dirname(ffmeta.__file__),
                           current_app.config["METADATA_FILE"])) as t:
        rows = list(DictReader(t))
    vars_loaded = 0
    commit_increment = 1000
    group_ids = []
    umbrella_topics = set()
    for row in rows:
        # Determine group membership: a group id like "12a" splits into a
        # numeric part ("12") and an alphabetic sub-id ("a").
        # BUGFIX: was "[A-z]+", which also matched [ \ ] ^ _ ` (the ASCII
        # characters between 'Z' and 'a'); also reuse the match instead of
        # running the same regex twice.
        group_no = None
        group_sub = None
        groupclass = re.search("[A-Za-z]+", str(row["group"]))
        if not groupclass:
            group_no = str(row["group"])
        else:
            group_sub = groupclass.group(0)
            group_no = str(row["group"]).replace(group_sub, "")
        # Write variable data
        var = Variable(name=row["new_name"],
                       label=row["varlab"].replace('"', "'"),
                       old_name=row["old_name"],
                       data_type=row["type"],
                       warning=int(row["warning"]),
                       group_id=group_no,
                       group_subid=group_sub,
                       data_source=row["source"],
                       respondent=row["respondent"],
                       wave=str(row["wave"]),
                       scope=str(row["scope"]),
                       section=row.get("section"),
                       leaf=str(row["leaf"]))
        session.add(var)
        # Write topic data; umbrella pairs are collected and inserted later.
        topic1 = Topic(name=row["new_name"], topic=row["topic1"])
        session.add(topic1)
        umbrella_topics.add((row["topic1"], row["umbrella1"]))
        if len(row["topic2"]) > 0:
            # Some rows have multiple topics (up to 2)
            topic2 = Topic(name=row["new_name"], topic=row["topic2"])
            session.add(topic2)
            umbrella_topics.add((row["topic2"], row["umbrella2"]))
        # Write response data: every "labelN" column pairs with a "valueN"
        # column holding the response value.
        for key in row.keys():
            if "label" in key and len(row[key]) > 0:
                respidx = key.replace("label", "")
                try:
                    lab_pts = row[key].split(" ", 1)
                    val = row["value" + respidx]
                    if lab_pts[0] == val:
                        # Drop the prefix if it's the response value
                        lab = lab_pts[1]
                    else:
                        lab = row[key]
                except IndexError:
                    # Default to the full entry if we can't clean up
                    lab = row[key]
                # Append new response row
                resp = Response(name=row["new_name"],
                                label=lab,
                                value=row["value" + respidx])
                session.add(resp)
        # Add to group list (raw group id, for the counts table below)
        group_ids.append(str(row["group"]))
        vars_loaded += 1
        # Commit in increments of commit_increment
        if vars_loaded % commit_increment == 0:
            session.commit()
    # Commit any remaining rows
    session.commit()
    # Build groups table
    # TODO: The groups quality is bad -- revisit this
    groups = Counter(group_ids)
    for group_id, group_n in groups.items():
        grp = Group(group_id=group_id, count=group_n)
        session.add(grp)
    session.commit()
    # Build umbrellas table
    for topic, umbrella in umbrella_topics:
        umb = Umbrella(topic=topic, umbrella=umbrella)
        session.add(umb)
    session.commit()
    # Yield result
    return "Loaded {} rows to database.".format(str(vars_loaded))
def populate_tables(quiet=False):
    '''Load metadata from the `raw` table to other tables.

    quiet -- when True, skip the interactive confirmation prompt.

    Wipes response2/variable3/topics/wave, then copies every row of `raw2`
    into a Variable (plus its Response rows), collecting the distinct
    topics, subtopics and waves along the way and inserting them into
    their own tables at the end. Commits every 500 rows.
    '''
    if not quiet and input(
            'This operation will import data from the "raw" table and will WIPE OUT data from all other tables. ARE YOU SURE you want to proceed (yes/no)? '
    ) != 'yes':
        return
    session.execute('DELETE FROM `response2`')
    session.execute('DELETE FROM `variable3`')
    session.execute('DELETE FROM `topics`')
    session.execute('DELETE FROM `wave`')
    session.commit()

    distinct_topics = set()
    distinct_subtopics = set()
    distinct_waves = set()

    # Map of boolean flag column -> display label for the focal person.
    # (Replaces the fragile `l = locals()` lookup, which depended on local
    # variable names matching column names.)
    focal_person_labels = {
        'fp_fchild': 'Focal Child',
        'fp_mother': 'Mother',
        'fp_father': 'Father',
        'fp_PCG': 'Primary Caregiver',
        'fp_partner': 'Partner',
        'fp_other': 'Other',
    }

    print('-----------\nPopulating variables\n-----------')
    for i, row in enumerate(session.execute('SELECT * FROM `raw2`'), start=1):
        label = row['varlab']
        if label is not None:
            # replacement logic carried over from old import function
            label = label.replace('"', "'")

        wave = row['wave']
        distinct_waves.add(wave)

        # Human-readable comma-separated list of the set flag columns.
        focal_person = ', '.join(
            lab for col, lab in focal_person_labels.items() if row[col])

        # Collect distinct topics/subtopics; both are ';'-separated lists.
        topics = row['topics']
        if topics is not None:
            distinct_topics.update(
                t.strip() for t in topics.split(';') if t.strip())
        subtopics = row['subtopics']
        if subtopics is not None:
            distinct_subtopics.update(
                s.strip() for s in subtopics.split(';') if s.strip())

        variable = Variable(name=row['new_name'],
                            label=label,
                            old_name=row['old_name'],
                            data_type=row['type'],
                            warning=row['warning'],
                            group_id=row['group'],
                            data_source=row['source'],
                            respondent=row['respondent'],
                            n_cities_asked=row['n_cities_asked'],
                            section=row['section'],
                            leaf=row['leaf'],
                            scale=row['scale'],
                            probe=row['probe'],
                            qtext=row['qtext'],
                            fp_fchild=row['fp_fchild'],
                            fp_mother=row['fp_mother'],
                            fp_father=row['fp_father'],
                            fp_PCG=row['fp_PCG'],
                            fp_partner=row['fp_partner'],
                            fp_other=row['fp_other'],
                            focal_person=focal_person,
                            survey=row['survey'],
                            wave=wave,
                            topics=topics,
                            subtopics=subtopics,
                            in_FFC_file=row['in_FFC_file'],
                            obs=row['obs'],
                            min=row['min'],
                            max=row['max'],
                            avg=row['avg'],
                            std=row['std'])
        session.add(variable)

        # Write response data: every "labelN" column pairs with "valueN",
        # "freqN" and "perN" columns.
        for key in row.keys():
            if "label" in key and row[key] is not None and len(row[key]) > 0:
                respidx = key.replace("label", "")
                try:
                    lab_pts = row[key].split(" ", 1)
                    val = row["value" + respidx]
                    if lab_pts[0] == val:
                        # Drop the prefix if it's the response value
                        lab = lab_pts[1]
                    else:
                        lab = row[key]
                except IndexError:
                    # Default to the full entry if we can't clean up
                    lab = row[key]
                # Append new response row
                resp = Response(name=row["new_name"],
                                label=lab,
                                value=row["value" + respidx],
                                freq=row["freq" + respidx],
                                per=row["per" + respidx])
                session.add(resp)

        if not i % 500:
            print("Added {} rows.".format(i))
            session.commit()

    # SECURITY FIX: the inserts below previously interpolated values with
    # str.format(), which broke on quotes and was injection-prone. Use
    # bound parameters instead.
    for topic in distinct_topics:
        session.execute('INSERT INTO topics (topic) VALUES (:topic)',
                        {'topic': topic})
    for subtopic in distinct_subtopics:
        session.execute('INSERT INTO subtopics (subtopic) VALUES (:subtopic)',
                        {'subtopic': subtopic})
    for wave in distinct_waves:
        if wave is not None:
            # BUGFIX: was wave.lstrip('Year '), but lstrip strips a character
            # *set* ('Y','e','a','r',' '), not the literal prefix. Strip the
            # prefix explicitly; non-numeric remainders sort first (0).
            prefix = 'Year '
            remainder = wave[len(prefix):] if wave.startswith(prefix) else wave
            try:
                order_id = int(remainder)
            except ValueError:
                order_id = 0
            session.execute(
                'INSERT INTO wave (name, order_id) VALUES (:name, :order_id)',
                {'name': wave, 'order_id': order_id})
    session.commit()