import csv
import json
import os
import re
import sys
from nltk.stem import WordNetLemmatizer


def read_csv_file(source_file):
    # Rows are expected as (id, space-separated topic labels, document text).
    topic_models_lists = {}
    with open(source_file, "r", newline="") as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row[1].split()
            single_model = Model(remove_stopwords=False, text_string=row[2],
                                 need_stem=True, input_stemmed=True)
            single_model.to_dirichlet()
            for topic in labels:
                topic_models_lists.setdefault(topic, []).append(single_model)

    # Merge the per-document models that share a label into one topic model.
    topic_models = {}
    for topic in topic_models_lists:
        topic_models[topic] = Model(remove_stopwords=False,
                                    need_stem=True, input_stemmed=True)
        for single_model in topic_models_lists[topic]:
            topic_models[topic] += single_model

        topic_models[topic].to_dirichlet()

    print "Finished Reading models"
    return topic_models
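The `Model` and `Sentence` classes these examples rely on are defined elsewhere in the source project and are not shown here. Purely as a reading aid, the following is a minimal stand-in sketch of the interface the call sites appear to assume; every detail (constructor flags, `update`, `normalize`, `to_dirichlet`, the `model` term table, and `+=` merging) is inferred and should not be taken as the project's actual implementation.

from collections import Counter


class Model(object):
    # Hypothetical stand-in: a bag-of-words term table with merge support.
    def __init__(self, remove_stopwords, text_string=None, text_dict=None,
                 text_list=None, need_stem=False, input_stemmed=False):
        self.model = Counter()                  # term -> count / probability
        self.remove_stopwords = remove_stopwords
        self.need_stem = need_stem
        self.input_stemmed = input_stemmed
        if text_string is not None:
            self.update(text_list=text_string.split())
        if text_dict is not None:
            self.model.update(text_dict)
        if text_list is not None:
            self.update(text_list=text_list)

    def update(self, text_list):
        # Add raw term counts; stemming/stopword handling omitted in this sketch.
        self.model.update(text_list)

    def normalize(self):
        # Turn counts into a probability distribution.
        total = float(sum(self.model.values()))
        if total:
            for w in self.model:
                self.model[w] /= total

    def to_dirichlet(self):
        # Placeholder for whatever smoothing the real class performs.
        self.normalize()

    def __iadd__(self, other):
        self.model.update(other.model)
        return self


class Sentence(object):
    # Hypothetical stand-in: tokenizes one sentence into raw/stemmed models.
    def __init__(self, text, remove_stopwords=False):
        self.raw_model = Model(remove_stopwords, text_list=text.lower().split())
        self.stemmed_model = self.raw_model     # the real class would stem

With this sketch in place, `read_csv_file` expects rows like `id,"label1 label2","document text"`; the exact CSV schema is an inference from the `row[1]`/`row[2]` accesses above.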

def get_model_for_entities(source_dir):
    # Aggregate one model per entity type across every JSON file in source_dir.
    models = {}
    for instance in next(os.walk(source_dir))[2]:
        with open(os.path.join(source_dir, instance)) as fh:
            data = json.load(fh)
        for entity_type in data:
            if entity_type not in models:
                # Start each new entity type from an empty model.
                models[entity_type] = Model(True, need_stem=True)
                models[entity_type].normalize()
            for entity in data[entity_type]:
                temp = Model(True,
                             text_dict=data[entity_type][entity],
                             need_stem=True,
                             input_stemmed=True)
                temp.normalize()
                models[entity_type] += temp
    return models
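A hypothetical invocation follows; the directory name is invented, and the JSON layout `{entity_type: {entity: {term: count}}}` is inferred from the `text_dict` argument above.

# Illustrative only: "entities/" and its contents are assumptions.
# Each JSON file is expected to look like:
#   {"PERSON": {"Ada Lovelace": {"analyt": 3.0, "engin": 2.0}}, ...}
entity_models = get_model_for_entities("entities/")
for entity_type in entity_models:
    print(entity_type, len(entity_models[entity_type].model))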
Example #3
def get_all_words(example_result_tuples):
    # Accumulate a stemmed, stopword-filtered word model over all sentences.
    word_model = Model(True, need_stem=True)

    for single_tuple in example_result_tuples:
        word_model += Sentence(single_tuple['sentence'],
                               remove_stopwords=True).stemmed_model

    word_model.normalize()

    return word_model
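A hedged usage sketch: the `{'sentence': ...}` tuple layout comes from the loop above, while the sentences themselves are invented.

tuples = [{'sentence': "The quick brown fox jumps over the lazy dog."},
          {'sentence': "Foxes jump quickly."}]
word_model = get_all_words(tuples)
# Top stemmed terms by normalized weight.
print(sorted(word_model.model.items(), key=lambda x: x[1], reverse=True)[:5])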
Example #4
def get_all_words(result_tuples):
    # Variant: keep stopwords, skip stemming, and smooth with to_dirichlet()
    # instead of normalizing.
    word_model = Model(False, need_stem=False)

    for single_tuple in result_tuples:
        word_model += Sentence(single_tuple['sentence'],
                               remove_stopwords=False).raw_model

    word_model.to_dirichlet()

    return word_model

def get_all_words(tuple_results):
    # Sum normalized per-identifier word distributions into one global table.
    words = {}
    for identifier in tuple_results:
        word_model = Model(True, need_stem=True)
        for single_tuple in tuple_results[identifier]:
            word_model += Sentence(single_tuple['sentence'],
                                   remove_stopwords=True).stemmed_model

        word_model.normalize()
        for word in word_model.model:
            words[word] = words.get(word, 0) + word_model.model[word]
    return words

def get_verbs(tuple_result):
    # Sum normalized per-identifier verb distributions, skipping verbs in the
    # module-level NO_NEED skip-list (defined elsewhere).
    verbs = {}
    for identifier in tuple_result:
        verb_model = Model(True)
        for single_tuple in tuple_result[identifier]:
            verb = single_tuple['verb']
            if verb not in NO_NEED:
                verb_model.update(text_list=[verb])
        verb_model.normalize()
        for verb in verb_model.model:
            verbs[verb] = verbs.get(verb, 0) + verb_model.model[verb]
    return verbs
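A sketch of calling `get_verbs`; `NO_NEED` is referenced but never defined in these examples, so the skip-list below is an assumption, as is the tuple data.

NO_NEED = {"be", "have"}                 # assumed skip-list, defined elsewhere

tuple_result = {
    "doc1": [{'verb': "run"}, {'verb': "be"}, {'verb': "jump"}],
    "doc2": [{'verb': "run"}],
}
# With the stand-in Model above: doc1 normalizes to {run: 0.5, jump: 0.5}
# and doc2 to {run: 1.0}, so the sums are {'run': 1.5, 'jump': 0.5}.
print(get_verbs(tuple_result))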
Example #7
def get_single_model(candidate_dir):
    # Load every JSON file under candidate_dir and build one normalized model
    # per entry, keyed by file name and then by entry key.
    candidate_models = {}
    for a_file in next(os.walk(candidate_dir))[2]:
        candidate_models[a_file] = {}
        with open(os.path.join(candidate_dir, a_file)) as fh:
            temp_model = json.load(fh)
        for w in temp_model:
            if w not in candidate_models[a_file]:
                temp = Model(True,
                             text_dict=temp_model[w],
                             need_stem=True,
                             input_stemmed=True)
                temp.normalize()
                candidate_models[a_file][w] = temp

    return candidate_models
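Hypothetical usage; the directory name is invented, and the per-file JSON layout `{key: {term: count}}` is inferred from the `text_dict` argument.

# Illustrative only: "candidates/" is an assumption.
candidate_models = get_single_model("candidates/")
for file_name in candidate_models:
    for key, model in candidate_models[file_name].items():
        print(file_name, key, len(model.model))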
Example #8
def get_all_verbs(example_result_tuples):
    # Build a normalized model over verbs, lemmatizing any verb that is not
    # already a base-form (VB) token.
    verb_model = Model(True, need_stem=True)
    lemmatizer = WordNetLemmatizer()

    for single_tuple in example_result_tuples:
        word = single_tuple['verb']
        if single_tuple['verb_label'] != 'VB':
            word = lemmatizer.lemmatize(word, 'v')
        try:
            verb_model.update(text_list=[str(word)])
        except TypeError:
            print("Wrong word!")
            print(word)
            print(type(word))
            print(single_tuple)
            sys.exit(0)
    verb_model.normalize()

    return verb_model
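A sketch of the tuple layout this function expects, inferred from the `'verb'` and `'verb_label'` field accesses; the data is invented, and the lemmatization step needs the NLTK WordNet corpus to be downloaded.

tuples = [{'verb': "running", 'verb_label': "VBG"},   # lemmatized to "run"
          {'verb': "jump", 'verb_label': "VB"}]       # kept as-is
verb_model = get_all_verbs(tuples)
# With the stand-in Model above: {'run': 0.5, 'jump': 0.5}.
print(verb_model.model)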
Example #9
def get_all_verbs(result_tuples):
    # Variant: keep raw verb forms (no lemmatization) and smooth with
    # to_dirichlet() instead of normalizing.
    verb_model = Model(False, need_stem=False)

    for single_tuple in result_tuples:
        word = single_tuple['verb']
        try:
            verb_model.update(text_list=[str(word)])
        except TypeError:
            print("Wrong word!")
            print(word)
            print(type(word))
            print(single_tuple)
            sys.exit(0)
    verb_model.to_dirichlet()

    return verb_model
Example #10
def get_sentence_window(entity_map, sentence, windows):
    """
    Use the whole sentence as the context window for every entity it mentions.
    """
    for w in entity_map:
        if sentence.find(w) != -1:
            # Map the surface form to its canonical name when one is given.
            if entity_map[w]:
                w = entity_map[w]
            if w not in windows:
                windows[w] = Model(True, need_stem=True)

            windows[w] += Sentence(re.sub("\n", " ", sentence),
                                   remove_stopwords=True).stemmed_model
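A hedged example of the `entity_map` convention implied above: keys are surface forms, and a truthy value is the canonical name to file the window under. The sentence and names are invented.

entity_map = {"Jefferson County": "",                # keep the surface form
              "JeffCo": "Jefferson County"}          # alias -> canonical name
windows = {}
get_sentence_window(entity_map,
                    "Flooding hit Jefferson County\nlate on Tuesday.",
                    windows)
print(list(windows))                                 # -> ['Jefferson County']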
Example #11
def get_sub_features(model, size):
    """
    Merge all sub-models and keep the top `size` terms as features.
    """
    data = Model(True, need_stem=True, input_stemmed=True)
    for instance in model:
        for w in model[instance]:
            data += model[instance][w]
    data.normalize()

    sorted_terms = sorted(data.model.items(), key=lambda x: x[1], reverse=True)
    features = dict(sorted_terms[:size])
    print("kept %d features" % len(features))
    return features
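A sketch wiring `get_sub_features` to the nested `{file: {key: Model}}` structure produced by `get_single_model` above; the directory name and feature count are illustrative.

# Illustrative only: keep the 100 highest-weighted terms as features.
candidate_models = get_single_model("candidates/")
features = get_sub_features(candidate_models, 100)
print(len(features))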