Example #1
def load_clusters():
    """Load info from topics.txt file into Cluster, TermCluster tables"""

    # Delete whatever's in the db already
    Cluster.query.delete()
    TermCluster.query.delete()

    count_clusters = 0

    for row in open("topics.csv"):

        row = row.rstrip().split(",")

        # Parse the row into the appropriate data types for seeding
        cluster = int(row[1][-3:])
        word = row[3].strip()

        # Check if word is in our list of key terms. If it is, add to
        # TermCluster table to allow for lookup later (see seed.py for TODO)

        if Term.check_for_term(word):
            term_cluster_to_add = TermCluster(word=word, cluster_id=cluster)
            db.session.add(term_cluster_to_add)
            db.session.commit()

        # Check if a cluster is in our list of clusters. If it's not, add it.
        if not Cluster.check_for_cluster(cluster):
            cluster_to_add = Cluster(cluster_id=cluster)
            db.session.add(cluster_to_add)
            db.session.commit()

        # Print where we are and increment counter
        print "Topics.txt seeding row", count_clusters

        count_clusters += 1
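
The helpers Term.check_for_term() and Cluster.check_for_cluster() are presumably defined on the model classes and are not shown in these snippets. Below is a minimal sketch of how such lookup helpers could be written as Flask-SQLAlchemy classmethods; the column definitions and table names here are illustrative assumptions, not the project's actual schema.

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()


class Term(db.Model):
    """Key term tracked by the seeding scripts (illustrative schema)."""

    __tablename__ = "terms"
    word = db.Column(db.String(100), primary_key=True)

    @classmethod
    def check_for_term(cls, word):
        """Return True if the word has already been seeded."""
        return db.session.query(cls.query.filter_by(word=word).exists()).scalar()


class Cluster(db.Model):
    """Topic cluster (illustrative schema)."""

    __tablename__ = "clusters"
    cluster_id = db.Column(db.Integer, primary_key=True)

    @classmethod
    def check_for_cluster(cls, cluster_id):
        """Return True if the cluster id has already been added."""
        return db.session.query(
            cls.query.filter_by(cluster_id=cluster_id).exists()
        ).scalar()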
Example #2
def load_studies_terms():
    """Loads info from studies_terms.txt into StudyTerm & Term tables.

    File format: R ID \t pmid \t word \t frequency

    Source: Neurosynth features.txt, transformed in R to long format."""


    print "Studies_terms.txt seeding"

    # Delete all rows in the existing tables, so if we need to run this a
    # second time, we won't be trying to add duplicate rows
    StudyTerm.query.delete()
    Term.query.delete()

    skip = True
    count_studies_terms = 0
    studies_terms = open("seed_data/studies_terms.txt")

    for row in studies_terms:
        # Skip the first line of the file
        if skip:
            skip = False
            continue

        # Stop after 5000 lines
        # if count_studies_terms > 5000:
        #     break

        # Parse txt file and convert to appropriate data types for seeding
        row = row.rstrip().split('\t')

        # If the term starts with "X", it is not a word but a number, e.g. "X01"
        # These don't make sense to track, so skip these rows.
        if row[2].startswith('\"X'):
            continue

        # Skip the lines indicating that a term did not appear anywhere
        # in the article (frequency of 0)
        if float(row[3]) == 0.0:
            continue

        pmid = int(row[1])
        word = row[2].strip('\"').replace(".", " ")
        freq = float(row[3])

        # Check if the word is already in Term; if not, add it
        if not Term.check_for_term(word):
            word_to_add = Term(word=word)
            db.session.add(word_to_add)

        # Add the row to the studies_terms table
        studies_terms_to_add = StudyTerm(word=word, pmid=pmid, frequency=freq)
        db.session.add(studies_terms_to_add)
        db.session.commit()

        # Print where we are and increment counter
        print "studies_terms.txt seeding row ", count_studies_terms
        count_studies_terms += 1
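
Committing inside the loop issues one transaction per line of the seed file, which is simple but slow for a large file like Neurosynth's features.txt. Below is a sketch of the same StudyTerm loop with batched commits; it assumes the db session and StudyTerm model are imported from the models module, and it omits the Term bookkeeping for brevity.

def load_studies_terms_batched(path="seed_data/studies_terms.txt", batch_size=1000):
    """Variant of load_studies_terms() that commits in batches (sketch)."""

    with open(path) as studies_terms:
        next(studies_terms)  # Skip the header line

        for i, row in enumerate(studies_terms):
            row = row.rstrip().split("\t")

            # Same filters as above: drop "X01"-style terms and zero frequencies
            if row[2].startswith('"X') or float(row[3]) == 0.0:
                continue

            db.session.add(StudyTerm(word=row[2].strip('"').replace(".", " "),
                                     pmid=int(row[1]),
                                     frequency=float(row[3])))

            # Flush pending rows every batch_size lines instead of once per row
            if i % batch_size == 0:
                db.session.commit()

        db.session.commit()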
Example #3
def load_clusters():
    """Load info from topics.txt file into Cluster, TermCluster tables

    File format: R row id,Topic XXX,R column ID,word

        where XXX represents a number between 0 and 400
        R IDs can be discarded during seeding

    Source: topic clustering data from Neurosynth, converted to long format
    in R prior to seeding. 
    Notes: the words tracked in this clustering are not in perfect
    alignment with those tracked in studies_terms.txt. Approximately 2000 of the 
    terms in studies_terms have a topical cluster, the remaining ~1000 do not.
    This number could be improved by stemming. Many of the words not tracked
    in clusters are multi-word phrases."""

    # Delete whatever's in the db already
    Cluster.query.delete()
    TermCluster.query.delete()

    count_clusters = 0
    topics_fileobj = open('seed_data/topics.csv')

    for row in topics_fileobj:

        row = row.rstrip().split(',')

        # Parse the row into the appropriate data types for seeding
        cluster = int(row[1][-3:])
        word = row[3].strip()

        # Check if word is in our list of key terms. If it is, add to
        # TermCluster table to allow for lookup later (see model.py for TODO)

        if Term.check_for_term(word):
            term_cluster_to_add = TermCluster(word=word, cluster_id=cluster)
            db.session.add(term_cluster_to_add)
            db.session.commit()

        # Check if a cluster is in our list of clusters. If it's not, add it.
        if not Cluster.check_for_cluster(cluster):
            cluster_to_add = Cluster(cluster_id=cluster)
            db.session.add(cluster_to_add)
            db.session.commit()

        # Print where we are and increment counter
        print "Topics.txt seeding row", count_clusters

        count_clusters += 1

    topics_fileobj.close()
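
As the docstring notes, coverage could be improved by stemming, since many studies_terms words differ from the cluster words only by inflection. Purely as an illustration (the code above does no stemming), terms on both sides could be normalized with something like NLTK's Porter stemmer before the check_for_term() lookup:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def normalize(word):
    """Lowercase and stem a term so that e.g. 'memory' and 'memories' match."""
    return stemmer.stem(word.lower())

# Hypothetical use inside the loop above, replacing the exact-match check:
#     if Term.check_for_term(normalize(word)):
#         ...
# This only helps if the Term table itself was seeded with normalized words.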