Ejemplo n.º 1
0
def add_pub_concepts(year):
    """Create a ``CONCEPT`` relationship for every (publication, concept) pair.

    The mapping is loaded with ``readPubConcepts(year)``; each key is used as
    a ``Publication.pub_id`` and each item under it as a ``Concept.name``.
    One ``merge`` statement is issued per pair and the relationship is tagged
    with ``year``.

    Args:
        year: Year passed to ``readPubConcepts`` and written into the
            relationship's ``year`` property.
    """
    # This per-row approach is far too slow for real data; the Neo4j CSV
    # import was used instead:
    #
    # using periodic commit 10000 load CSV from "file:///pub_concept_2008.txt" as row FIELDTERMINATOR '\t' match (p:Publication{pub_id:toInt(row[0])}) match (c:Concept{name:row[1]}) merge (p)-[:CONCEPT{year:2008}]-(c) return count(p);

    print("Adding pub-concept data")

    session = neo4j_functions.connect()
    try:
        pubConceptDic = readPubConcepts(year)
        counter = 0
        for p in pubConceptDic:
            for c in pubConceptDic[p]:
                if counter % 10000 == 0:
                    # Progress report, then swap to a fresh session.
                    print("%s %s" % (strftime("%H:%M:%S", gmtime()), counter))
                    session.close()
                    session = neo4j_functions.connect()
                # Escape backslashes before quotes so that neither can end or
                # corrupt the Cypher string literal.
                # NOTE(review): query parameters would be more robust still;
                # kept as a literal so the statement format is unchanged.
                name = c.replace("\\", "\\\\").replace("'", "\\'")
                com = ("match (p:Publication {pub_id:" + str(p) + "}) "
                       "match (c:Concept {name:'" + name + "'}) "
                       "merge (p)-[:CONCEPT{year:" + str(year) + "}]-(c);")
                session.run(com)
                counter += 1
    finally:
        # Release the last session even when a statement fails.
        session.close()
Ejemplo n.º 2
0
def pubs_per_org():
    """Count distinct publications per organisation.

    Returns:
        dict: ``Org.code`` -> number of distinct ``Publication`` nodes reached
        through ``(Org)-[:MEMBER_OF]-(Staff)-[:PUBLISHED]-(Publication)``.
    """
    com = 'match (o:Org)-[:MEMBER_OF]-(s:Staff)-[:PUBLISHED]-(p:Publication) return o.code as o,count(distinct(p)) as c;'
    session = neo4j_functions.connect()
    try:
        # Build the whole mapping before the session is closed.
        return dict((res['o'], res['c']) for res in session.run(com))
    finally:
        session.close()
Ejemplo n.º 3
0
def staff_per_org():
    """Count staff per organisation.

    Returns:
        dict: ``Org.code`` -> ``count(s)`` over the ``Staff`` nodes matched by
        ``(Org)-[:MEMBER_OF]-(Staff)`` (the count is not de-duplicated).
    """
    com = ' match (o:Org)-[:MEMBER_OF]-(s:Staff) return o.code as o,count(s) as c;'
    session = neo4j_functions.connect()
    try:
        # Build the whole mapping before the session is closed.
        return dict((res['o'], res['c']) for res in session.run(com))
    finally:
        session.close()
Ejemplo n.º 4
0
def pubs_per_person():
    """Count publications per staff member.

    Returns:
        dict: ``Staff.person_id`` -> the count returned by the query for the
        ``(Staff)-[:PUBLISHED]-(Publication)`` pattern.
    """
    # NOTE(review): the query aliases person_id as ``p`` while also counting
    # ``p``; presumably count(p) still refers to the matched Publication.
    com = 'match (s:Staff)-[:PUBLISHED]-(p:Publication) return s.person_id as p,count(p) as c;'
    session = neo4j_functions.connect()
    try:
        # Build the whole mapping before the session is closed.
        return dict((res['p'], res['c']) for res in session.run(com))
    finally:
        session.close()
Ejemplo n.º 5
0
def add_metrics_to_graph():
    """Copy the metrics in ``output/pubMetrics.txt`` onto Publication nodes.

    Each usable line holds 13 tab-separated fields: the publication id,
    eleven numeric metrics and a text-standard label. For every such line a
    ``set`` statement stores the metrics on the ``Publication`` whose
    ``pub_id`` equals the first field. Lines with any other field count are
    skipped, the same filter ``get_metrics`` applies when it re-reads this
    file.

    Raises:
        ValueError: if a metric field cannot be parsed as a float.
    """
    # Property names, in the column order of the metrics file.
    numeric_props = (
        'cf',
        'ld',
        'ts_flesch_reading_ease',
        'ts_smog_index',
        'ts_flesch_kincaid_grade',
        'ts_coleman_liau_index',
        'ts_automated_readability_index',
        'ts_dale_chall_readability_score',
        'ts_difficult_words',
        'ts_linsear_write_formula',
        'ts_gunning_fog',
    )
    session = neo4j_functions.connect()
    try:
        with open('output/pubMetrics.txt', 'rb') as p:
            for line in p:
                fields = line.rstrip().split('\t')
                if len(fields) != len(numeric_props) + 2:
                    continue
                pid = fields[0]
                # The file stores text, so each metric must be converted to a
                # number before "%.4f" can format it.
                assignments = ",".join(
                    "p." + prop + " = " + ("%.4f" % float(value))
                    for prop, value in zip(numeric_props, fields[1:-1]))
                # Escape backslashes before quotes so the label cannot break
                # out of its Cypher string literal.
                text_standard = fields[-1].replace("\\", "\\\\").replace(
                    "'", "\\'")
                com = ("match (p:Publication) where p.pub_id = " + str(pid) +
                       " set " + assignments +
                       ",p.ts_text_standard='" + text_standard + "';")
                session.run(com)
    finally:
        session.close()
Ejemplo n.º 6
0
def get_metrics():
    """Compute text metrics for publication abstracts not processed so far.

    ``output/pubMetrics.txt`` doubles as result file and resume index: ids
    found on existing 13-field lines are skipped, and new results are
    appended as one line per publication with the tab-separated fields

        pub_id, cf, ld, <nine textstat scores>, text_standard

    where every numeric value is written with four decimals. Publications
    with a missing or empty abstract are skipped.
    """
    metrics_path = 'output/pubMetrics.txt'

    # Collect the ids already on disk so that a re-run continues where the
    # previous one stopped.
    done = set()
    if os.path.exists(metrics_path):
        with open(metrics_path, 'rb') as p:
            for line in p:
                fields = line.split('\t')
                if len(fields) == 13:
                    done.add(int(fields[0]))
    print("Found " + str(len(done)) + " entries already processed")

    com = "match (p:Publication) return distinct p.abstract as a, p.pub_id as pid;"
    session = neo4j_functions.connect()
    try:
        # The with-block guarantees buffered rows reach the disk.
        with open(metrics_path, 'a') as o:
            counter = 0
            for res in session.run(com):
                if counter % 1000 == 0:
                    print("%s %s" % (counter,
                                     strftime("%Y-%m-%d %H:%M:%S", gmtime())))
                counter += 1
                abstract = res['a']
                pid = res['pid']
                # A null abstract is skipped just like an empty one.
                if pid in done or not abstract:
                    continue
                cf = nltk_functions.content_fraction(abstract)
                ld = nltk_functions.lexical_diversity(abstract)
                # run_textstat must yield exactly ten values: nine scores
                # followed by the text-standard label.
                (ts_flesch_reading_ease, ts_smog_index,
                 ts_flesch_kincaid_grade, ts_coleman_liau_index,
                 ts_automated_readability_index,
                 ts_dale_chall_readability_score, ts_difficult_words,
                 ts_linsear_write_formula, ts_gunning_fog,
                 ts_text_standard) = nltk_functions.run_textstat(abstract)
                numeric = (cf, ld, ts_flesch_reading_ease, ts_smog_index,
                           ts_flesch_kincaid_grade, ts_coleman_liau_index,
                           ts_automated_readability_index,
                           ts_dale_chall_readability_score,
                           ts_difficult_words, ts_linsear_write_formula,
                           ts_gunning_fog)
                columns = [str(pid)]
                columns.extend("%.4f" % value for value in numeric)
                columns.append(str(ts_text_standard))
                o.write("\t".join(columns) + "\n")
    finally:
        session.close()
Ejemplo n.º 7
0
def distance_metrics(year):
    """Store pairwise Euclidean distances between enrichment profiles.

    The same statement is run for ``Staff`` and then for ``Org`` nodes. For
    each pair of nodes that are ``ENRICHED`` for a common target in ``year``
    it computes ``sqrt(sum((-log10(cpval_1) - -log10(cpval_2))^2))`` over the
    shared targets and writes it to property ``euclidean_<year>`` of a merged
    ``DISTANCE`` relationship. Each statement and its result rows are printed.

    Args:
        year: Year used to filter the ``ENRICHED`` relationships and to name
            the ``euclidean_<year>`` property.
    """
    # One template for both node labels; only the label differs.
    template = (
        "match (p1:%(label)s)-[e1:ENRICHED]->(s), (p2:%(label)s)-[e2:ENRICHED]->(s) "
        "where id(p1) < id(p2) and e1.year = %(year)s and e2.year = %(year)s "
        "with sqrt(sum((-log10(e1.cpval) - -log10(e2.cpval))^2)) as euc,p1,p2 "
        "merge (p1)-[d:DISTANCE]-(p2) set d.euclidean_%(year)s = euc return count(euc);"
    )
    jobs = (
        ("Creating staff distance data", "Staff"),
        ("Create org distance data", "Org"),
    )
    session = neo4j_functions.connect()
    try:
        for message, label in jobs:
            print(message)
            com = template % {"label": label, "year": year}
            print(com)
            for res in session.run(com):
                print(res)
    finally:
        session.close()
Ejemplo n.º 8
0
def copy_graph_to_mysql():
    """Copy orgs, people, concepts and enrichments from Neo4j into MySQL.

    Rows are written with ``INSERT IGNORE`` into ``browser_org``,
    ``browser_person``, ``browser_concept``, ``browser_enrichedp`` and
    ``browser_enrichedo``, in that order, because later tables look up ids in
    earlier ones. A failed org insert is reported and skipped; any other
    database error propagates and nothing is committed.
    """
    session = neo4j_functions.connect()
    try:
        # NOTE(review): connect() is a project helper; presumably it returns
        # a mysql.connector connection, since mysql.connector.Error is caught
        # below.
        cnx = connect()
        try:
            curA = cnx.cursor(buffered=True)

            # Orgs.
            print("Adding orgs")
            neo4j_com = "match (o:Org) return o.code as c, o.short_name as s, o.full_name as f, o.url as u, o.type as t order by o.code;"
            mysql_com = (
                "INSERT IGNORE INTO browser_org (code, short_name, full_name, url, type) "
                "VALUES (%s, %s, %s, %s, %s)")
            for res in session.run(neo4j_com):
                code = res['c']
                # A missing or empty URL is stored as the placeholder 'n/a'.
                url = res['u'] or 'n/a'
                try:
                    curA.execute(mysql_com,
                                 (code, res['s'], res['f'], url, res['t']))
                except mysql.connector.Error as err:
                    print("failed to insert org %s: %s" % (code, err))

            # People: position and sex are not in the graph, so both are
            # stored as 'n/a'.
            print("Adding people")
            neo4j_com = "match (o:Org)--(s:Staff) return s.published_name as n, s.person_id as pid, o.code as s;"
            mysql_com = (
                "INSERT IGNORE INTO browser_person (org_id, user_name, name, position, sex) "
                "VALUES ((SELECT id from browser_org where code = %s), %s, %s, %s, %s)"
            )
            for res in session.run(neo4j_com):
                curA.execute(mysql_com,
                             (res['s'], res['pid'], res['n'], 'n/a', 'n/a'))

            # Concepts.
            print("Adding concepts")
            neo4j_com = "match (c:Concept) return c.name as n, c.type as t"
            mysql_com = ("INSERT IGNORE INTO browser_concept (name, type) "
                         "VALUES (%s, %s)")
            for res in session.run(neo4j_com):
                curA.execute(mysql_com, (res['n'], res['t']))

            # Enrichments: the person and org variants differ only in the
            # statements and in the result key that identifies the owner.
            enrichment_jobs = (
                ("Adding people-concepts",
                 "match (s:Staff)-[e]-(c:Concept) return s.person_id as pid,c.name, e.localCount,e.localTotal,e.globalCount,e.globalTotal,e.year,e.cpval;",
                 "INSERT IGNORE INTO browser_enrichedp (person_id,concept_id,localCount,localTotal,globalCount,globalTotal,year,cpval) "
                 "VALUES ((SELECT id from browser_person where user_name = %s), (SELECT id from browser_concept where name = %s), %s, %s, %s, %s, %s, %s )",
                 'pid'),
                ("Adding org-concepts",
                 "match (o:Org)-[e]-(c:Concept) return o.code as c,c.name, e.localCount,e.localTotal,e.globalCount,e.globalTotal,e.year,e.cpval;",
                 "INSERT IGNORE INTO browser_enrichedo (org_id,concept_id,localCount,localTotal,globalCount,globalTotal,year,cpval) "
                 "VALUES ((SELECT id from browser_org where code = %s), (SELECT id from browser_concept where name = %s), %s, %s, %s, %s, %s, %s )",
                 'c'),
            )
            for message, neo4j_com, mysql_com, owner_key in enrichment_jobs:
                print(message)
                for res in session.run(neo4j_com):
                    curA.execute(mysql_com, (
                        res[owner_key], res['c.name'], res['e.localCount'],
                        res['e.localTotal'], res['e.globalCount'],
                        res['e.globalTotal'], res['e.year'], res['e.cpval']))

            # Without an explicit commit the inserts are discarded on close
            # unless the connection runs in autocommit mode; committing is
            # harmless in that case.
            cnx.commit()
        finally:
            cnx.close()
    finally:
        session.close()
Ejemplo n.º 9
0
def add_enriched_to_graph(year):
    """Load the enrichment result files for ``year`` into the graph.

    For each concept type ('type', 'bigram', 'trigram') this reads
    ``<outDir>/person_<type>_enriched_<year>.txt`` and
    ``<outDir>/org_<type>_enriched_<year>.txt``. Both have a header line
    followed by nine tab-separated fields per line: name, concept, a1, a2,
    b1, b2, odds, pval, cor_pval. Three passes run per type:

    1. merge a ``Concept`` node for every concept in the person file;
    2. merge a ``Staff``-``ENRICHED``-``Concept`` relationship per person row;
    3. merge an ``Org``-``ENRICHED``-``Concept`` relationship per org row.

    Args:
        year: Year used in the file names and in the relationship's ``year``
            property.

    Raises:
        ValueError: if a data line does not have exactly nine fields.
    """
    print("Adding enrichment data to graph...")

    def data_rows(path):
        # Yield the fields of every line after the header; the file is closed
        # when the iteration ends.
        with open(path, 'rb') as handle:
            next(handle, None)
            for line in handle:
                yield line.rstrip().split('\t')

    def quoted(value):
        # Render value as a Cypher string literal. Backslashes are escaped
        # before quotes so neither can end or corrupt the literal.
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"

    def cpval_literal(cor_pval):
        # Positive p-values are written in two-decimal scientific notation,
        # anything else as 0.0.
        value = float(cor_pval)
        if value > 0:
            return "%03.02e" % value
        return "0.0"

    def enriched_props(a1, a2, b1, b2, cor_pval):
        # Property map shared by the person and org relationships.
        return ("{type:'pure-comp',year:" + str(year) +
                ",localCount:" + str(a1) + ",localTotal:" + str(a2) +
                ",globalCount:" + str(b1) + ",globalTotal:" + str(b2) +
                ",cpval:" + cpval_literal(cor_pval) + "}")

    session = neo4j_functions.connect()
    try:
        # Create the index before the first MERGE so concept lookups can use
        # it; the statement used to run once per concept type, after the
        # nodes of that type had been merged.
        session.run("CREATE index on :Concept(name);")

        for concept_type in ['type', 'bigram', 'trigram']:
            print("Adding data for data type '" + concept_type + "'")
            person_file = (outDir + '/person_' + concept_type +
                           '_enriched_' + str(year) + '.txt')
            org_file = (outDir + '/org_' + concept_type + '_enriched_' +
                        str(year) + '.txt')
            concept_match = ("MATCH (c:Concept {name:%s,type:'" +
                             concept_type + "'}) ")

            # Line count (header included) for the progress messages.
            with open(person_file) as handle:
                num_lines = sum(1 for _ in handle)

            # Pass 1: concept nodes.
            for counter, fields in enumerate(data_rows(person_file)):
                if counter % 10000 == 0:
                    print(str(counter) + '/' + str(num_lines))
                    session.close()
                    session = neo4j_functions.connect()
                name, concept, a1, a2, b1, b2, odds, pval, cor_pval = fields
                session.run("MERGE (c:Concept {name:" + quoted(concept) +
                            ", type:'" + concept_type + "'})")

            # Pass 2: person-concept relationships.
            print("Creating person-concept relationships...")
            for counter, fields in enumerate(data_rows(person_file)):
                if counter % 10000 == 0:
                    print(str(counter) + '/' + str(num_lines))
                    session.close()
                    session = neo4j_functions.connect()
                name, concept, a1, a2, b1, b2, odds, pval, cor_pval = fields
                # The id is the part after the last colon, so a colon inside
                # the person's name cannot shift it.
                person_id = name.rsplit(':', 1)[1]
                com = ("MATCH (s:Staff {person_id: " + person_id + "}) " +
                       concept_match % quoted(concept) +
                       "MERGE (s)-[:ENRICHED" +
                       enriched_props(a1, a2, b1, b2, cor_pval) + "]-(c);")
                session.run(com)

            # Pass 3: org-concept relationships.
            print("Creating org-concept relationships...")
            for counter, fields in enumerate(data_rows(org_file)):
                if counter % 10000 == 0:
                    print(counter)
                    session.close()
                    session = neo4j_functions.connect()
                name, concept, a1, a2, b1, b2, odds, pval, cor_pval = fields
                code = name.split(':')[1]
                com = ("MATCH (o:Org {code: " + quoted(code) + "}) " +
                       concept_match % quoted(concept) +
                       "MERGE (o)-[:ENRICHED" +
                       enriched_props(a1, a2, b1, b2, cor_pval) + "]-(c);")
                session.run(com)
    finally:
        session.close()
Ejemplo n.º 10
0
def background_frequencies(year):
    """Count in how many publications each type, bigram and trigram occurs.

    Every publication linked to a ``Staff`` node with ``pub_year <= year``
    whose abstract is longer than ``config.minAbsLength`` is tokenised (title
    and abstract together). Each distinct token or n-gram is counted once per
    publication; anything involving a token from ``ignore_tokens()`` is
    dropped. N-grams are stored with their words joined by ':'.

    Files written under ``outDir``:

    * ``background_<kind>_frequencies_<year>.txt`` for kind in type, bigram,
      trigram: header ``<kind>\\tcount``, then one row per concept, most
      frequent first;
    * ``pub_concept_<year>.txt``: ``pub_id\\tconcept\\tkind`` rows, no header.

    Args:
        year: Upper bound for ``pub_year``; also part of the file names.
    """
    print("Getting background frequencies")
    com = "match (s:Staff)--(p:Publication) where p.pub_year <= " + str(
        year
    ) + " return distinct p.abstract as a, p.title as t, p.pub_id as pid;"
    print(com)

    kinds = ('type', 'bigram', 'trigram')
    frequencies = dict((kind, defaultdict(int)) for kind in kinds)
    pubConceptDic = defaultdict(dict)
    # A set makes the per-token membership tests constant time.
    ignored = set(ignore_tokens())

    session = neo4j_functions.connect()
    try:
        for counter, res in enumerate(session.run(com)):
            if counter % 1000 == 0:
                print(counter)
            abstract = res['a']
            # A null abstract is skipped just like a too-short one.
            if not abstract or len(abstract) <= config.minAbsLength:
                continue
            # Join with a space so the last title word and the first abstract
            # word are not fused into a single token.
            title_abs = (res['t'] or '') + ' ' + abstract
            pid = res['pid']
            types = nltk_functions.tokenise_and_lemm(title_abs)
            bigrams, trigrams = nltk_functions.bigrams_and_trigrams(
                " ".join(types))

            for token in set(types):
                if token in ignored:
                    continue
                pubConceptDic[pid][token] = 'type'
                frequencies['type'][token] += 1

            for kind, ngrams in (('bigram', bigrams), ('trigram', trigrams)):
                for gram in set(ngrams):
                    # Drop the n-gram if any of its words is ignored.
                    if ignored.intersection(gram):
                        continue
                    concept = ':'.join(gram)
                    pubConceptDic[pid][concept] = kind
                    frequencies[kind][concept] += 1
    finally:
        session.close()

    for kind in kinds:
        counts = frequencies[kind]
        path = (outDir + '/background_' + kind + '_frequencies_' +
                str(year) + '.txt')
        with open(path, 'w') as out:
            out.write(kind + '\tcount\n')
            for t in sorted(counts, key=counts.get, reverse=True):
                out.write(t + '\t' + str(counts[t]) + '\n')

    with open(outDir + '/pub_concept_' + str(year) + '.txt', 'w') as out:
        total = len(pubConceptDic)
        for counter, p in enumerate(pubConceptDic):
            if counter % 1000 == 0:
                print("%s %s" % (counter, total))
            for c in pubConceptDic[p]:
                out.write(str(p) + '\t' + c + "\t" + pubConceptDic[p][c] +
                          '\n')
Ejemplo n.º 11
0
def org_frequencies(year):
    """Count, per organisation, how many of its publications use each concept.

    For every distinct (org, publication) pair with ``pub_year <= year`` the
    publication's concepts are looked up in ``readPubConcepts(year)`` and
    counted under the key ``<short_name>:<code>``. Concepts in
    ``ignore_tokens()`` are skipped, and so are 'type' concepts whose
    background frequency from ``typeFreqs(year)`` exceeds ``freqCutoff``.

    One file per kind is written to
    ``<outDir>/org_<kind>_frequencies_<year>.txt`` with header
    ``org\\t<kind>\\tcount``. Orgs are ordered by their number of distinct
    concepts, then concepts by count, both descending.

    Args:
        year: Upper bound for ``pub_year``; also part of the file names.
    """
    print("Getting org frequencies")
    com = "match (o:Org)--(s:Staff)--(p:Publication) where p.pub_year <= " + str(
        year
    ) + " return distinct o.short_name as o1, o.code as o2, p.pub_id as pid;"

    kinds = ('type', 'bigram', 'trigram')
    counts = dict((kind, defaultdict(dict)) for kind in kinds)
    # A set makes the per-concept membership test constant time.
    ignored = set(ignore_tokens())
    # Background type frequencies, used to filter out very common types.
    typeConceptDic = typeFreqs(year)
    pubConceptDic = readPubConcepts(year)

    session = neo4j_functions.connect()
    try:
        for counter, res in enumerate(session.run(com)):
            if counter % 1000 == 0:
                print(counter)
            org = res['o1'] + ':' + res['o2']
            # Publications without recorded concepts contribute nothing
            # instead of raising KeyError.
            concepts = pubConceptDic.get(str(res['pid']), {})
            for s, kind in concepts.items():
                if s in ignored or kind not in counts:
                    continue
                if kind == 'type' and int(typeConceptDic[s]) > freqCutoff:
                    continue
                per_org = counts[kind][org]
                per_org[s] = per_org.get(s, 0) + 1
    finally:
        session.close()

    for kind in kinds:
        by_org = counts[kind]
        path = outDir + '/org_' + kind + '_frequencies_' + str(year) + '.txt'
        with open(path, 'w') as out:
            out.write('org\t' + kind + '\tcount\n')
            # Explicit size key: the old key=dict.get ordering depended on
            # Python 2 dict comparison, which compares sizes first.
            for p in sorted(by_org, key=lambda o: len(by_org[o]),
                            reverse=True):
                per_org = by_org[p]
                for t in sorted(per_org, key=per_org.get, reverse=True):
                    out.write(str(p) + '\t' + t + '\t' + str(per_org[t]) +
                              '\n')
Ejemplo n.º 12
0
def person_frequencies(year):
    """Count, per staff member, how many of their publications use each concept.

    For every distinct (staff, publication) pair with ``pub_year <= year``
    the publication's concepts are looked up in ``readPubConcepts(year)`` and
    counted under the key ``<published_name>:<person_id>``. Concepts in
    ``ignore_tokens()`` are skipped, and so are 'type' concepts whose
    background frequency from ``typeFreqs(year)`` exceeds ``freqCutoff``.

    One file per kind is written to
    ``<outDir>/person_<kind>_frequencies_<year>.txt`` with header
    ``person\\t<kind>\\tcount``. People are ordered by their number of
    distinct concepts, then concepts by count, both descending.

    Args:
        year: Upper bound for ``pub_year``; also part of the file names.
    """
    print("Getting person frequencies")
    com = "match (s:Staff)--(p:Publication) where p.pub_year <= " + str(
        year
    ) + " return distinct s.published_name as p1, s.person_id as p2, p.pub_id as pid;"

    kinds = ('type', 'bigram', 'trigram')
    counts = dict((kind, defaultdict(dict)) for kind in kinds)
    # A set makes the per-concept membership test constant time.
    ignored = set(ignore_tokens())
    # Background type frequencies, used to filter out very common types.
    typeConceptDic = typeFreqs(year)
    pubConceptDic = readPubConcepts(year)

    session = neo4j_functions.connect()
    try:
        for counter, res in enumerate(session.run(com)):
            if counter % 1000 == 0:
                print(counter)
            person = res['p1'] + ':' + str(res['p2'])
            # Publications without recorded concepts contribute nothing
            # instead of raising KeyError.
            concepts = pubConceptDic.get(str(res['pid']), {})
            for s, kind in concepts.items():
                if s in ignored or kind not in counts:
                    continue
                if kind == 'type' and int(typeConceptDic[s]) > freqCutoff:
                    continue
                per_person = counts[kind][person]
                per_person[s] = per_person.get(s, 0) + 1
    finally:
        session.close()

    for kind in kinds:
        by_person = counts[kind]
        path = (outDir + '/person_' + kind + '_frequencies_' + str(year) +
                '.txt')
        with open(path, 'w') as out:
            out.write('person\t' + kind + '\tcount\n')
            # Explicit size key: the old key=dict.get ordering depended on
            # Python 2 dict comparison, which compares sizes first.
            for p in sorted(by_person, key=lambda k: len(by_person[k]),
                            reverse=True):
                per_person = by_person[p]
                for t in sorted(per_person, key=per_person.get,
                                reverse=True):
                    out.write(str(p) + '\t' + t + '\t' +
                              str(per_person[t]) + '\n')
Ejemplo n.º 13
0
#import matplotlib
#matplotlib.style.use('ggplot')
# Module-level setup: imports plus shared NLP helpers and a graph session,
# all created once at import time.
# NOTE(review): `nltk` and `neo4j_functions` are used below but are not
# imported in the lines visible here; presumably they are imported earlier
# in the file. Confirm.
from csv import reader
import pandas as pd
import numpy as np
from nltk.corpus import wordnet as wordnet
from textstat.textstat import textstat
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer.
stemmer = SnowballStemmer("english")
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer.
wordnet_lemmatizer = WordNetLemmatizer()

# Association-measure objects for NLTK's bigram and trigram collocations.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Opened as a side effect of importing this module, so importing it calls
# neo4j_functions.connect().
session = neo4j_functions.connect()

#analyse output
# awk -F '\t' '$2>50' output/nltk_counts.txt | sort -t$'\t' -k4 -nr | less

#http://www.nltk.org/book/ch01.html


def content_fraction(text):
    """Return the fraction of items in ``text`` that are not English stopwords.

    ``text`` is iterated item by item and each item is lower-cased before
    the stopword test.
    NOTE(review): this looks written for a sequence of word tokens; a plain
    string would be examined character by character. Confirm what callers
    pass.

    Args:
        text: Sized iterable of strings.

    Returns:
        float: value between 0.0 and 1.0; 0.0 when ``text`` is empty.
    """
    if not text:
        return 0.0
    # A set gives constant-time membership tests.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    content_count = sum(1 for w in text if w.lower() not in stopwords)
    # float() forces true division; int / int truncates on Python 2.
    return float(content_count) / len(text)


def filter_stopwords_and_length(text, wordLength):
    stopwords = nltk.corpus.stopwords.words('english')