import csv
import json
import os
import re
import sys

from nltk.stem import WordNetLemmatizer

# Model and Sentence are project-local classes used throughout this module;
# their import path is not shown in this excerpt.


def read_csv_file(source_file):
    """Read labeled documents from a CSV file and build one smoothed
    language model per topic label.

    Each row is expected to carry space-separated topic labels in
    column 1 and the document text in column 2.
    """
    topic_models_lists = {}
    with open(source_file, "rb") as f:
        spamreader = csv.reader(f)
        for row in spamreader:
            labels = row[1].split()
            single_model = Model(remove_stopwords=False, text_string=row[2],
                                 need_stem=True, input_stemmed=True)
            single_model.to_dirichlet()
            # A document can carry several labels; file it under each one.
            for topic in labels:
                if topic not in topic_models_lists:
                    topic_models_lists[topic] = []
                topic_models_lists[topic].append(single_model)
    # Merge the per-document models of each topic into one topic model.
    topic_models = {}
    for topic in topic_models_lists:
        topic_models[topic] = Model(remove_stopwords=False, need_stem=True,
                                    input_stemmed=True)
        for single_model in topic_models_lists[topic]:
            topic_models[topic] += single_model
        topic_models[topic].to_dirichlet()
    print "Finished reading models"
    return topic_models

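# Usage sketch (illustrative, not part of the original source). Assumes a CSV
# whose rows carry space-separated topic labels in column 1 and the document
# text in column 2, matching the row[1]/row[2] indexing above; the file name
# is hypothetical.
#
#     topic_models = read_csv_file("labeled_docs.csv")
#     for topic in topic_models:
#         print topic
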
def get_model_for_entities(source_dir):
    """Build one normalized model per entity type from the JSON files
    in source_dir."""
    models = {}
    # os.walk(source_dir).next()[2] lists the files directly under source_dir.
    for instance in os.walk(source_dir).next()[2]:
        data = json.load(open(os.path.join(source_dir, instance)))
        for entity_type in data:
            if entity_type not in models:
                temp = Model(True, need_stem=True)
                temp.normalize()
                models[entity_type] = temp
            for entity in data[entity_type]:
                temp = Model(True, text_dict=data[entity_type][entity],
                             need_stem=True, input_stemmed=True)
                temp.normalize()
                models[entity_type] += temp
    return models

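# Usage sketch (illustrative). Assumes each JSON file in the directory maps
# entity types to {entity: text_dict} objects, matching how the loader
# indexes data[entity_type][entity] above; the directory name is hypothetical.
#
#     entity_models = get_model_for_entities("data/entities/")
#     print entity_models.keys()
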
def get_all_words(example_result_tuples):
    """Aggregate a normalized, stemmed, stopword-free word model over
    all example sentences."""
    word_model = Model(True, need_stem=True)
    for single_tuple in example_result_tuples:
        word_model += Sentence(single_tuple['sentence'],
                               remove_stopwords=True).stemmed_model
    word_model.normalize()
    return word_model

def get_all_words(result_tuples):
    """Variant of get_all_words above: keeps stopwords, skips stemming,
    and applies Dirichlet smoothing instead of normalization. Note that
    it shadows the previous definition if both stay in one module."""
    word_model = Model(False, need_stem=False)
    for single_tuple in result_tuples:
        word_model += Sentence(single_tuple['sentence'],
                               remove_stopwords=False).raw_model
    word_model.to_dirichlet()
    return word_model

def get_all_words(tuple_results):
    """Variant that takes identifier -> tuple-list groupings and returns
    a flat word -> summed-weight dict, normalizing within each group
    first."""
    words = {}
    for identifier in tuple_results:
        word_model = Model(True, need_stem=True)
        for single_tuple in tuple_results[identifier]:
            word_model += Sentence(single_tuple['sentence'],
                                   remove_stopwords=True).stemmed_model
        word_model.normalize()
        for word in word_model.model:
            if word not in words:
                words[word] = 0
            words[word] += word_model.model[word]
    return words

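# Usage sketch (illustrative) for the grouped variant above. Assumes the
# input maps identifiers to lists of dicts carrying a 'sentence' string, as
# the loop indexes single_tuple['sentence']; the literal data is made up.
#
#     grouped = {"doc1": [{"sentence": "The court issued a ruling."}]}
#     word_weights = get_all_words(grouped)
#     for word in sorted(word_weights, key=word_weights.get, reverse=True):
#         print word, word_weights[word]
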
def get_verbs(tuple_result):
    """Collect verbs per identifier group, skipping those in the
    module-level NO_NEED collection (defined elsewhere), and return a
    verb -> summed-weight dict."""
    verbs = {}
    for identifier in tuple_result:
        verb_model = Model(True)
        for single_tuple in tuple_result[identifier]:
            verb = single_tuple['verb']
            if verb not in NO_NEED:
                verb_model.update(text_list=[verb])
        verb_model.normalize()
        for verb in verb_model.model:
            if verb not in verbs:
                verbs[verb] = 0
            verbs[verb] += verb_model.model[verb]
    return verbs

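# Usage sketch (illustrative). NO_NEED's real contents are not shown in this
# excerpt, so the value below is hypothetical, as is the input data.
#
#     NO_NEED = set(["be", "have", "do"])
#     verb_weights = get_verbs({"doc1": [{"verb": "issue"}, {"verb": "be"}]})
#     print verb_weights
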
def get_single_model(candidate_dir):
    """Load one normalized model per (file, key) pair from the JSON
    files in candidate_dir."""
    candidate_models = {}
    files = os.walk(candidate_dir).next()[2]
    for a_file in files:
        candidate_models[a_file] = {}
        temp_model = json.load(open(os.path.join(candidate_dir, a_file)))
        for w in temp_model:
            if w not in candidate_models[a_file]:
                temp = Model(True, text_dict=temp_model[w],
                             need_stem=True, input_stemmed=True)
                temp.normalize()
                candidate_models[a_file][w] = temp
    return candidate_models

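# Usage sketch (illustrative). Assumes candidate_dir holds JSON files that
# each map keys to text_dict objects accepted by Model; the directory name
# is hypothetical.
#
#     candidate_models = get_single_model("data/candidates/")
#     for a_file in candidate_models:
#         print a_file, len(candidate_models[a_file])
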
def get_all_verbs(example_result_tuples):
    """Build a normalized model over verbs, lemmatizing any verb whose
    label is not the base form VB."""
    verb_model = Model(True, need_stem=True)
    lemmatizer = WordNetLemmatizer()
    for single_tuple in example_result_tuples:
        word = single_tuple['verb']
        if single_tuple['verb_label'] != 'VB':
            word = lemmatizer.lemmatize(word, 'v')
        try:
            verb_model.update(text_list=[str(word)])
        except TypeError:
            print "Wrong word!"
            print word
            print type(word)
            print single_tuple
            sys.exit(0)
    verb_model.normalize()
    return verb_model

def get_all_verbs(result_tuples):
    """Variant of get_all_verbs above: no lemmatization, and Dirichlet
    smoothing instead of normalization. Shadows the previous definition
    if both stay in one module."""
    verb_model = Model(False, need_stem=False)
    for single_tuple in result_tuples:
        word = single_tuple['verb']
        try:
            verb_model.update(text_list=[str(word)])
        except TypeError:
            print "Wrong word!"
            print word
            print type(word)
            print single_tuple
            sys.exit(0)
    verb_model.to_dirichlet()
    return verb_model

def get_sentence_window(entity_map, sentence, windows):
    """Use the whole sentence as the context window for every entity it
    mentions, accumulating into `windows` in place."""
    for w in entity_map:
        if sentence.find(w) != -1:
            # Map the surface form to its canonical name when one is given.
            if entity_map[w]:
                w = entity_map[w]
            if w not in windows:
                windows[w] = Model(True, need_stem=True)
            windows[w] += Sentence(re.sub("\n", " ", sentence),
                                   remove_stopwords=True).stemmed_model

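# Usage sketch (illustrative). entity_map maps a surface string to either a
# canonical name or a falsy value (meaning: keep the surface form); windows
# is mutated in place, one stemmed sentence model per entity. The literal
# data is made up.
#
#     windows = {}
#     entity_map = {"Jefferson County": "", "J. County": "Jefferson County"}
#     get_sentence_window(entity_map, "J. County approved the budget.", windows)
#     print windows.keys()   # -> ["Jefferson County"]
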
def get_sub_features(model, size):
    """Get the top `size` terms of the merged, normalized model as
    features."""
    data = Model(True, need_stem=True, input_stemmed=True)
    for instance in model:
        for w in model[instance]:
            data += model[instance][w]
    data.normalize()
    sorted_terms = sorted(data.model.items(), key=lambda x: x[1], reverse=True)
    features = {}
    for (w, v) in sorted_terms[:size]:
        features[w] = v
    print "get %d features" % (len(features))
    return features

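# Usage sketch (illustrative). `model` is assumed to be a nested mapping of
# instance -> key -> Model, e.g. the structure returned by get_single_model
# above; `size` caps the number of top-weighted terms returned.
#
#     candidate_models = get_single_model("data/candidates/")
#     features = get_sub_features(candidate_models, 100)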