# In[7]:
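
# Setup assumed from earlier notebook cells (not shown in this section): a
# minimal sketch; the CSV file name and the utility's module path are
# assumptions, not confirmed by this section.
import pickle

import nltk
import pandas as pd

from Word2VecUtility import Word2VecUtility  # hypothetical module path

data = pd.read_csv('reviews.csv')  # assumed source; must contain a 'text' column
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # may need nltk.download('punkt')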


# .ix is deprecated; use .iloc for positional indexing
print(data.iloc[:10]['text'])
# print(data['text'][2])


# In[8]:


review_sents = []
print("Cleaning and parsing the reviews...\n")
# Split every review into cleaned, tokenized sentences.
for i in range(len(data["text"])):
    review_sents += Word2VecUtility.review_to_sentences(data.iloc[i]["text"], tokenizer)
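
# For reference: in the Kaggle word2vec tutorial utility, which this helper
# appears to follow, review_to_sentences splits a raw review into sentences
# with the punkt tokenizer and cleans each one into a word list. A minimal
# sketch under that assumption (the function below is illustrative, not the
# actual source):
import re

def review_to_sentences_sketch(review, tokenizer):
    """Split a raw review into a list of cleaned, tokenized sentences."""
    sentences = []
    for raw_sentence in tokenizer.tokenize(review.strip()):
        if raw_sentence:
            # Keep letters only, lowercase, and split on whitespace.
            words = re.sub("[^a-zA-Z]", " ", raw_sentence).lower().split()
            if words:
                sentences.append(words)
    return sentences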


# In[53]:


# Cache the parsed sentences so the cleaning pass need not be repeated.
with open('review_sents_1859888.pkl', 'wb') as out:
    pickle.dump(review_sents, out)


# In[11]:


with open('review_sents_1859888.pkl', 'rb') as f:
    review_sents = pickle.load(f)
print(len(review_sents))
print(review_sents[:5])
# Example 2: a second usage of Word2VecUtility.review_to_sentences, preparing
# sentence lists from labeled and unlabeled sets of paper content for word2vec.
# Verify the number of reviews that were read (100,000 in total)
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n" % (train["Paper_content"].size,
                                      test["Paper_content"].size,
                                      unlabeled_train["Paper_content"].size))

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["Paper_content"]:
    sentences += Word2VecUtility.review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["Paper_content"]:
    sentences += Word2VecUtility.review_to_sentences(review, tokenizer)

# ****** Set parameters and train the word2vec model
#
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 300   # Word vector dimensionality
min_word_count = 15  # Minimum word count
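
# The pasted example stops after the first two parameters. A minimal sketch of
# how training typically continues with gensim; the remaining parameter values
# and the output file name are assumptions, not taken from this section.
from gensim.models import word2vec

num_workers = 4      # Number of parallel worker threads (assumed)
context = 10         # Context window size (assumed)
downsampling = 1e-3  # Downsampling of frequent words (assumed)

model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)

# Note: gensim >= 4 renames `size` to `vector_size` and removes `init_sims`.
model.init_sims(replace=True)  # finalize vectors for a memory-efficient model
model.save("300features_15minwords_10context")  # assumed file name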
