def train_supervised_model():
    D = snowball.read_data()
    raw_tweets = D['tweet']
    tokenized_tweets = [word_tokenize(tw) for tw in raw_tweets]
    # as a first pass, we'll treat these as labels.
    tags, tag_set = _get_tags(raw_tweets)

    base_model = Word2Vec()
    # (optionally: workers=multiprocessing.cpu_count(), iter=3)

    # initialize a shared vocab
    base_model.build_vocab(tokenized_tweets)
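    # every per-tag model below starts as a deep copy of this base model, so
    # all of the models share a single vocabulary and are directly comparable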

    # for now, put everything in the training split (no held-out test set)
    n_train = int(1 * len(tags))

    tweets_train, tweets_test = tokenized_tweets[:n_train], tokenized_tweets[
        n_train:]
    tags_train, tags_test = tags[:n_train], tags[n_train:]

    tags_to_models = {}
    for tag in tag_set:
        tweets_for_tag = _tweets_with_tags(tweets_train, tags_train, [tag])
        # train up model for this tag
        m = copy.deepcopy(base_model)
        print("training language model for tag %s with %s examples..." %
              (tag, len(tweets_for_tag)))
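        # NOTE: the bare train() call below assumes an older gensim API; with
        # gensim >= 4 it needs explicit counts, e.g.
        #   m.train(tweets_for_tag, total_examples=len(tweets_for_tag), epochs=m.epochs)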
        m.train(tweets_for_tag)
        print("ok.")
        tags_to_models[tag] = m

    return tags_to_models
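

def _demo_supervised():
    # Illustrative usage sketch (hypothetical helper, not part of the original
    # pipeline): train the per-tag language models and list which tags got one.
    tags_to_models = train_supervised_model()
    print(sorted(tags_to_models.keys()))
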
def train_unsupervised_model(k=10,
                             alpha=.1,
                             max_iters=25,
                             convergence_threshold=.001,
                             baseline=False,
                             silent=False):
    '''
    Unsupervised topic model over the tweets: alternates between re-fitting one
    language model per topic and re-estimating phi, the (n documents x k topics)
    soft-assignment matrix; pi holds the corresponding topic proportions.

    Returns (raw_tweets, tokenized_tweets, phi, pi, topics_to_models).
    '''
    global base_model
    base_model = None

    D = snowball.read_data()
    raw_tweets = D['tweet_text']

    # filter out tweets tagged #crc
    tags = [snowball.which_tags(t) for t in raw_tweets]
    raw_tweets = [t for i, t in enumerate(raw_tweets) if "#crc" not in tags[i]]

    tokenized_tweets = [word_tokenize(tw) for tw in raw_tweets]

    def _seems_to_be_about_soccer(tweet):
        terms = {
            "worldcup", "ger", "usavcrc", "fra", "italia", "mexvcrc",
            "#mexvcrc", "nedvscrc", "#nedvscrc", "nedcrc", "#nedcrc",
            "itavscrc", "#itavscrc", "uruvscrc", "#uruvscrc", "worldcup2014",
            "#worldcup2014", "uruguay"
        }
        return any(t.lower() in terms for t in tweet)

    indices_to_keep = [
        idx for idx in range(len(tokenized_tweets))
        if not _seems_to_be_about_soccer(tokenized_tweets[idx])
    ]
    raw_tweets = [raw_tweets[idx] for idx in indices_to_keep]
    tokenized_tweets = [tokenized_tweets[idx] for idx in indices_to_keep]

    n = len(tokenized_tweets)
    alphas = [alpha] * k
    phi = np.zeros((n, k))

    for i in range(n):
        # give each document a random topic distribution drawn from Dirichlet(alpha)
        phi[i, :] = np.random.dirichlet(alphas)

    # initial topic probability estimates
    pi = estimate_pi(phi)

    if not silent:
        print("initial assignments (random)...")
        print_top_tweets_for_topics(phi, raw_tweets, pi)

    iter_ = 0
    converged = False
    last_LL = None  # log-likelihood from the previous iteration
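    # Rough sketch of the alternating updates performed in the loop below (the
    # exact implementations live in retrain_language_models / estimate_phi /
    # estimate_pi):
    #
    #   pi[j]     ~  mean_i phi[i, j]                 (topic proportions)
    #   phi[i, j] ~  pi[j] * P(tweet_i | model_j)     (per-tweet responsibilities)
    #
    # where model_j is the language model re-fit on tweets weighted toward topic j.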

    while not converged and iter_ < max_iters:

        #######
        # 1. update language models
        #######
        topics_to_models = retrain_language_models(tokenized_tweets,
                                                   phi,
                                                   baseline=baseline)

        #######
        # 2. re-estimate \phi
        #######
        phi = estimate_phi(tokenized_tweets, topics_to_models, pi)
        pi = estimate_pi(phi)

        #######
        # assess convergence
        #######
        if not silent:
            print_top_tweets_for_topics(phi, raw_tweets, pi, n=20)
        cur_LL = LL(topics_to_models, pi, phi, tokenized_tweets)
        print("finished iter: %s; LL: %s" % (iter_, cur_LL))
        print("\n")

        # declare convergence once the log-likelihood stops moving
        if last_LL is not None and abs(cur_LL - last_LL) < convergence_threshold:
            converged = True
        last_LL = cur_LL
        iter_ += 1

    return raw_tweets, tokenized_tweets, phi, pi, topics_to_models
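

def _demo_unsupervised():
    # Illustrative usage sketch (hypothetical helper, not part of the original
    # pipeline): run a small fit and print the ten tweets most strongly
    # assigned to topic 0, in the spirit of print_top_tweets_for_topics.
    raw, tokenized, phi, pi, models = train_unsupervised_model(k=5, max_iters=5)
    top_idx = np.argsort(-phi[:, 0])[:10]
    for i in top_idx:
        print(raw[i])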