def load_clusters():
    """Load info from the topics.csv file into the Cluster, TermCluster tables.

    Issues a delete against both tables, then reads "topics.csv" (relative
    to the current working directory) one line at a time. Each line is split
    on commas; the last three characters of the second field are parsed as
    the integer cluster id and the fourth field (whitespace-stripped) is the
    word.

    For every row:
      * a TermCluster row is added and committed when
        Term.check_for_term(word) is True;
      * a Cluster row is added and committed when
        Cluster.check_for_cluster(cluster) is False.

    A progress line is printed for every row processed.

    Raises:
        IOError/OSError: if "topics.csv" cannot be opened.
        IndexError: if a line has fewer than four comma-separated fields.
        ValueError: if the last three characters of the second field are
            not an integer.
    """
    # Delete whatever's in the db already
    Cluster.query.delete()
    TermCluster.query.delete()

    # Use a context manager so the file handle is released even when a row
    # fails to parse or a commit raises; a bare open() in the loop header
    # is never explicitly closed.
    with open("topics.csv") as topics_file:
        for count_clusters, line in enumerate(topics_file):
            row = line.rstrip().split(",")

            # Parse the txt into the appropriate data types for seeding
            cluster = int(row[1][-3:])
            word = row[3].strip()

            # Check if word is in our list of key terms. If it is, add to
            # TermCluster table to allow for lookup later (see seed.py for
            # TODO).
            # NOTE(review): the identity comparisons against True/False are
            # kept as-is because the helpers' return contracts are not
            # visible here; confirm they return real booleans.
            if Term.check_for_term(word) is True:
                db.session.add(TermCluster(word=word, cluster_id=cluster))
                db.session.commit()

            # Check if a cluster is in our list of clusters. If it's not,
            # add it. The commit happens immediately so that the check on
            # later rows can see this cluster.
            if Cluster.check_for_cluster(cluster) is False:
                db.session.add(Cluster(cluster_id=cluster))
                db.session.commit()

            # Print where we are. A single pre-formatted argument keeps the
            # output identical under both the Python 2 print statement and
            # the Python 3 print function.
            print("Topics.txt seeding row %d" % count_clusters)
def load_clusters():
    """Load info from the topics.csv file into the Cluster, TermCluster tables.

    File format:
        R row id,Topic XXX,R column ID,word
    where XXX represents a number between 0-400. The R ids can be discarded
    during seeding.

    Source: topic clustering data from Neurosynth, converted to long format
    in R prior to seeding.

    Notes: the words tracked in this clustering are not in perfect alignment
    with those tracked in studies_terms.txt. Approximately 2000 of the terms
    in studies_terms have a topical cluster, the remaining ~1000 do not. This
    number could be improved by stemming. Many of the words not tracked in
    clusters are multi-word phrases.

    Behavior: issues a delete against both tables, then reads
    'seed_data/topics.csv' (relative to the current working directory) one
    line at a time. The last three characters of the second field are parsed
    as the integer cluster id; the fourth field (whitespace-stripped) is the
    word. A TermCluster row is added and committed when
    Term.check_for_term(word) is True, and a Cluster row is added and
    committed when Cluster.check_for_cluster(cluster) is False. A progress
    line is printed for every row processed.

    Raises:
        IOError/OSError: if the seed file cannot be opened.
        IndexError: if a line has fewer than four comma-separated fields.
        ValueError: if the last three characters of the second field are
            not an integer.
    """
    # Delete whatever's in the db already
    Cluster.query.delete()
    TermCluster.query.delete()

    # A context manager guarantees the file is closed even if parsing or a
    # commit raises part-way through; a trailing close() call is skipped
    # whenever the loop exits with an exception.
    with open('seed_data/topics.csv') as topics_fileobj:
        for count_clusters, line in enumerate(topics_fileobj):
            row = line.rstrip().split(',')

            # Parse the txt into the appropriate data types for seeding
            cluster = int(row[1][-3:])
            word = row[3].strip()

            # Check if word is in our list of key terms. If it is, add to
            # TermCluster table to allow for lookup later (see model.py for
            # TODO).
            # NOTE(review): the identity comparisons against True/False are
            # kept as-is because the helpers' return contracts are not
            # visible here; confirm they return real booleans.
            if Term.check_for_term(word) is True:
                db.session.add(TermCluster(word=word, cluster_id=cluster))
                db.session.commit()

            # Check if a cluster is in our list of clusters. If it's not,
            # add it. The commit happens immediately so that the check on
            # later rows can see this cluster.
            if Cluster.check_for_cluster(cluster) is False:
                db.session.add(Cluster(cluster_id=cluster))
                db.session.commit()

            # Print where we are. A single pre-formatted argument keeps the
            # output identical under both the Python 2 print statement and
            # the Python 3 print function.
            print("Topics.txt seeding row %d" % count_clusters)