def get_expanded_family_dict(sim_metric="path", synset_names=False, unique=True):
    """Build a dictionary of behavioral-term families, optionally merged by stem.

    Parameters
    ----------
    sim_metric : str
        similarity metric forwarded to get_families (default "path")
    synset_names : bool
        forwarded to get_families
    unique : bool
        if True (default), families whose base terms reduce to the same
        stem are merged into a single entry keyed by that stem

    Returns
    -------
    dict
        when unique is True: {stem: {"base", "family", "direction",
        "similarity"}}; otherwise the raw family list from get_families.
    """
    from brainbehavior.nlp import do_stem
    families = get_families(sim_metric=sim_metric, synset_names=synset_names)
    if not unique:
        return families
    # Finding an identical stem means that we would need to tell the word
    # apart based on context (eg, sensitive as a noun vs. an adjective).
    # Since this method cannot do that (the stems are both "sensit"), we
    # have to simply merge the families.
    family_index = get_family_index(families)
    # Combine families, and save list of all stems (including family).
    combined_families = dict()
    # .items() instead of the Python-2-only .iteritems() so the code also
    # runs under Python 3; .extend() avoids rebuilding the list each pass.
    for stem, indices in family_index.items():
        direction = []
        family = []
        similarity = []
        for idx in indices:
            direction.extend(families[idx]["direction"])
            family.extend(do_stem(families[idx]["family"]))
            similarity.extend(families[idx]["similarity"])
        combined_families[stem] = {"base": stem,
                                   "family": family,
                                   "direction": direction,
                                   "similarity": similarity}
    return combined_families
def get_expanded_family_dict(sim_metric="path",synset_names=False,unique=True):
    """Return term families keyed by stem (merged) or the raw family list.

    :param sim_metric: similarity metric forwarded to get_families
    :param synset_names: forwarded to get_families
    :param unique: merge families sharing a base-term stem when True
    :return: dict of merged families keyed by stem, or the raw list
    """
    from brainbehavior.nlp import do_stem
    families = get_families(sim_metric=sim_metric,synset_names=synset_names)
    if not unique:
        return families
    # Finding an identical stem means that we would need to tell the word
    # apart based on context (eg, sensitive as a noun vs. an adjective).
    # Since this method cannot do that (the stems are both "sensit"), we
    # have to simply merge the families.
    family_index = get_family_index(families)
    # Combine families, and save list of all stems (including family).
    combined_families = dict()
    # .items() replaces the Python-2-only .iteritems(); extend() replaces
    # the quadratic "list = list + other" concatenation.
    for stem, indices in family_index.items():
        merged_direction = []
        merged_family = []
        merged_similarity = []
        for idx in indices:
            merged_direction.extend(families[idx]["direction"])
            merged_family.extend(do_stem(families[idx]["family"]))
            merged_similarity.extend(families[idx]["similarity"])
        combined_families[stem] = {"base": stem,
                                   "family": merged_family,
                                   "direction": merged_direction,
                                   "similarity": merged_similarity}
    return combined_families
def get_expanded_behavior_list(sim_metric="path",synset_names=False):
    """Return a sorted, de-duplicated list of every stem across all families.

    :param sim_metric: similarity metric forwarded to get_families
    :param synset_names: forwarded to get_families
    :return: list of unique stems (base stems plus stemmed family members)
    """
    # Local import for consistency with the sibling functions in this
    # module; the original used do_stem without importing it here, which
    # presumably relied on a module-level import -- TODO confirm.
    from brainbehavior.nlp import do_stem
    families = get_families(sim_metric=sim_metric,synset_names=synset_names)
    family_index = get_family_index(families)
    allstems = []
    # .items() replaces the Python-2-only .iteritems().
    for stem, indices in family_index.items():
        allstems.append(stem)
        for idx in indices:
            # extend() avoids rebuilding the list on every iteration
            allstems.extend(do_stem(families[idx]["family"]))
    # numpy.unique sorts and de-duplicates in one step
    return numpy.unique(allstems).tolist()
def get_expanded_behavior_list(sim_metric="path", synset_names=False):
    """Collect every stem (base and stemmed family members) as a unique list.

    Parameters
    ----------
    sim_metric : str
        similarity metric forwarded to get_families
    synset_names : bool
        forwarded to get_families

    Returns
    -------
    list of unique stems, sorted by numpy.unique
    """
    # Import locally, matching the sibling functions; the original body
    # called do_stem without a visible import -- presumably a module-level
    # import existed, TODO confirm.
    from brainbehavior.nlp import do_stem
    families = get_families(sim_metric=sim_metric, synset_names=synset_names)
    family_index = get_family_index(families)
    allstems = []
    # iteritems() is Python-2-only; items() works on both 2 and 3.
    for stem, indices in family_index.items():
        allstems.append(stem)
        for idx in indices:
            allstems.extend(do_stem(families[idx]["family"]))
    return numpy.unique(allstems).tolist()
def get_family_index(families):
    """Group family positions by the stem of each family's base term.

    :param families: list of family dicts; "base" is either a plain string
        or a synset-like object exposing .name() (e.g. "dog.n.01")
    :return: dict mapping base stem -> list of indices into families
    """
    from brainbehavior.nlp import do_stem

    def _base_stem(family):
        # One place to compute the base stem (the original duplicated this
        # logic in two separate passes over families).
        base = family["base"]
        if isinstance(base, str):
            return do_stem(base)[0]
        # Synset-like object: take the lemma portion of e.g. "dog.n.01"
        return do_stem([base.name().split(".")[0]])[0]

    # Single pass: setdefault creates the bucket on first sight of a stem,
    # so the separate unique-stems pre-pass is unnecessary.
    family_index = dict()
    for f, family in enumerate(families):
        family_index.setdefault(_base_stem(family), []).append(f)
    return family_index
def get_family_index(families):
    """Index families by the stem of their base term.

    Parameters
    ----------
    families : list of dict
        each dict has a "base" that is either a str or a synset-like
        object whose .name() looks like "lemma.pos.nn"

    Returns
    -------
    dict mapping stem -> list of positions in ``families``
    """
    from brainbehavior.nlp import do_stem

    def _stem_of_base(family):
        # Shared helper: the original computed this identically in two
        # separate loops over families.
        base = family["base"]
        if isinstance(base, str):
            return do_stem(base)[0]
        return do_stem([base.name().split(".")[0]])[0]

    # Build the index in one pass; setdefault replaces the pre-pass that
    # collected unique stems and the holder read/append/write dance.
    family_index = dict()
    for position, family in enumerate(families):
        stem = _stem_of_base(family)
        family_index.setdefault(stem, []).append(position)
    return family_index
def setUp(self):
    """Load the pickled behavioral-trait terms and their stems for the tests.

    Sets self.terms (list of terms from the pickle) and self.stems
    (their stemmed forms via do_stem).
    """
    term_pickle = "../brainbehavior/data/cognitiveatlas/behavioraltraits.pkl"
    # Context manager closes the file handle; the original
    # pickle.load(open(...)) never closed it.
    with open(term_pickle, "rb") as fp:
        self.terms = pickle.load(fp)
    self.stems = do_stem(self.terms)
import sys # Here is the path to the folder with xml files topfolder = sys.argv[1] subfolder = sys.argv[2] term_pickle = sys.argv[3] outfolder = sys.argv[4] folder = "%s/%s" % (topfolder, subfolder) # Get compressed files in folder. zips = glob("%s/*.tar.gz" % folder) # Read in our terms terms = pickle.load(open(term_pickle, "rb")) stems = do_stem(terms) # We will save counts and total words dfcount = pandas.DataFrame(columns=stems) totalwords = [] # We want to keep a count of those with no terms noterms = 0 for z in zips: zname = "%s/%s" % (subfolder, os.path.basename(z)) text = get_xml_tree(z) counts = get_term_counts(terms, text) # Only save if we have at least one! if counts["count"].sum() > 0: totalwords.append(get_total_words(text))
if len(current.body)>0: content.append(current.body) ids.append(current.fullname) scores.append(current.score) except: print "Skipping %s" %sub.fullname print "%s has %s entities" %(disorder,len(content)) result = {"content":content,"disorder":disorder,"score":scores,"uids":ids,"retrieved":now} pickle.dump(result,open("analysis/reddit/%s_dict.pkl" %disorder,"wb")) ### 2. COUNT TERMS ################################################################## term_pickle = "brainbehavior/data/cognitiveatlas/behavioraltraits.pkl" terms = pickle.load(open(term_pickle,"rb")) stems = do_stem(terms) pickles = glob("%s/*_dict.pkl" %outfolder) for result_file in pickles: result = pickle.load(open(result_file,"rb")) content = result["content"] print "Parsing %s" %result_file # We will save counts and total words dfcount = pandas.DataFrame(columns=stems) totalwords = [] # We want to keep a count of those with no terms noterms = 0 for t in range(0,len(content)): print "%s of %s" %(t,len(content)) text = ''.join([i if ord(i) < 128 else ' ' for i in content[t]]) counts = get_term_counts(terms,str(text))