Esempio n. 1
0
def build_lemma_vocab(append_string=''):
    """
    Builds entire vocabulary from a dataframe of reviews.

    Reads the pickled reviews dataframe from
    '../data/pandas/review<append_string>.pkl', passes the ``text`` field
    of every row through process_text.text2lemmas, and counts how many
    times each lemma occurs over all rows.  The counts are saved as a
    pandas Series (lemma -> count) in
    '../data/pandas/vocab<append_string>.pkl'.

    append_string = suffix inserted before '.pkl' in both the input and
                    the output file name (default '')

    Returns True once the vocabulary has been written to file.
    """
    import process_text

    # Restore data from file
    reviews = pd.read_pickle('../data/pandas/review'+append_string+'.pkl')

    vocab = {}
    print(len(reviews))
    for count, (_, row) in enumerate(reviews.iterrows(), 1):
        # NOTE(review): assumes text2lemmas returns an iterable of lemma
        # tokens -- confirm against process_text.
        lemmas = process_text.text2lemmas(row.text)
        # Walk the lemmas of this row (not the vocab dict itself, which
        # starts out empty) so that unseen lemmas actually get added.
        for lemma in lemmas:
            vocab[lemma] = vocab.get(lemma, 0) + 1
        # Progress report every 100 rows
        if count % 100 == 0:
            print(count)

    # Save vocabulary to file
    vocab = pd.Series(vocab)
    vocab.to_pickle('../data/pandas/vocab'+append_string+'.pkl')

    return True
Esempio n. 2
0
def add_lemmas2pandas(type_string='', append_string=''):
    """
    Appends a 'lemmas' column holding the lemmatized text to the reviews or
    sentences dataframe and saves the result under a new file name.

    type_string   = {'review'|'sentences'}  selects which pickled dataframe
                    ('../data/pandas/<type_string><append_string>.pkl') is
                    loaded and lemmatized
    append_string = suffix inserted before '.pkl' in the input and output
                    file names (default '')

    The augmented dataframe is written to
    '../data/pandas/<type_string>_lemmas<append_string>.pkl'.

    Returns True on success, or None (after printing an error message)
    when type_string is not one of the two accepted values.
    """
    import process_text

    # Reject anything other than the two supported dataframe types
    if type_string not in ('review', 'sentences'):
        print("Error in add_lemmas2pandas:")
        print("    type_string must be either 'review' or 'sentences'")
        print("    please try again")
        return None

    frame = pd.read_pickle('../data/pandas/'+type_string+append_string+'.pkl')

    # Lemmatize the text of every row, reporting progress every 1000 rows
    lemma_rows = []
    for position, label in enumerate(frame.index):
        if position % 1000 == 0:
            print(position)
        record = frame.loc[label]
        lemma_rows.append(process_text.text2lemmas(record.text))

    # Attach the results as a new column aligned on the original index
    frame['lemmas'] = pd.Series(lemma_rows, index=frame.index)

    # Persist the augmented dataframe
    frame.to_pickle('../data/pandas/'+type_string+'_lemmas'+append_string+'.pkl')

    return True