# Sentence-tokenize every review with NLTK's pre-trained Punkt model and
# serialize the cleaned labeled set.  Assumes `train` / `unlabeled_train`
# (pandas DataFrames with "review" / "sentiment" columns), `preProc`, and
# the `nltk`/`json` imports are set up earlier in the file.

# Choose tokenizer from nltk
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

num_reviews = len(train["review"])
labeled = []
# Clean labeled reviews
for i in range(0, num_reviews):
    # Progress report every 1000 reviews
    if (i + 1) % 1000 == 0:
        print("Labeled Review %d of %d\n" % (i + 1, num_reviews))
    # The function review_to_sentences has been defined below
    labeled.append(
        preProc.review_to_sentences(train.review[i], tokenizer,
                                    str(train.sentiment[i])))

# Save cleaned up labeled reviews.  `with` guarantees the handle is
# flushed and closed; the original passed a bare open() to json.dump
# and leaked the file object.
with open("../../data/labeledSentiFFF.json", "w") as out_file:
    json.dump(labeled, out_file)

# BUG FIX: the unlabeled loop previously reused len(train["review"]) as
# its bound, which indexes `unlabeled_train` out of step whenever the
# two sets differ in size -- use the unlabeled set's own length.
num_unlabeled = len(unlabeled_train["review"])
unlabeled = []
# Clean unlabeled reviews
for i in range(0, num_unlabeled):
    if (i + 1) % 1000 == 0:
        print("Unlabeled Review %d of %d\n" % (i + 1, num_unlabeled))
    # The function review_to_sentences has been defined below
    unlabeled.append(
        preProc.review_to_sentences(unlabeled_train.review[i], tokenizer))
# NOTE(review): this chunk began mid-statement with the dangling fragment
# "quoting=3)" -- the head of the call fell outside the chunk.  It is
# reconstructed here from the identical pd.read_csv statement that appears
# verbatim later in this file; confirm against the original source.
unlabeled_train = pd.read_csv("../../data/unlabeledTrainData.tsv", header=0,
                              delimiter="\t", quoting=3)

# Load a tokenizer from nltk
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Initialize list for holding cleaned up sentences
bagOfsentences = []

# Parse labeled sentences and append to bagOfsentences
print("Parsing sentences from labeled training set")
for review in train["review"]:
    bagOfsentences.append(
        preProc.review_to_sentences(review, tokenizer, False, True, False))

# Parse unlabeled sentences and append to bagOfsentences
print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    bagOfsentences.append(
        preProc.review_to_sentences(review, tokenizer, False, True, False))

# Save bagOfsentences.  BUG FIX: the file was opened in append mode ("a"),
# so every re-run produced back-to-back JSON documents that json.load
# cannot parse; write mode produces one valid document per run.  `with`
# also closes/flushes the handle, which the bare open() never did.
with open("../../classifier/bagOfsentences.json", "w") as out_file:
    json.dump(bagOfsentences, out_file)

# Here is the function review_to_sentences
# Doc2vec variant of the review-cleaning pass: tokenize each labeled
# review into sentences (tagged with its sentiment) and serialize them.
# NOTE(review): this chunk ends mid-loop -- the unlabeled loop's append
# continues past this chunk -- so the code is left byte-for-byte intact
# and only comments were added.

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

num_reviews = len(train["review"])

labeled = []
#Clean labeled reviews
for i in range(0, num_reviews):
    # progress message every 1000 reviews
    if ((i + 1) % 1000 == 0):
        print("Labeled Review %d of %d\n" % (i + 1, num_reviews))
    #The function review_to_sentences has been defined below
    labeled.append(
        preProc.review_to_sentences(train.review[i], tokenizer,
                                    str(train.sentiment[i])))

#Save cleaned up labeled reviews
# NOTE(review): mode "a" appends a new JSON document on every run, which
# json.load cannot read back -- probably should be "w"; confirm.  The
# handle passed to json.dump is also never closed.
json.dump(labeled, open("../../classifier/doc2vec/labeledSentiFFF.json", "a"))

unlabeled = []
#Clean unlabeled reviews
# NOTE(review): the bound reuses len(train["review"]) while the loop
# (continuing past this chunk) presumably indexes unlabeled_train --
# verify the unlabeled set has at least that many rows.
for i in range(0, num_reviews):
    if ((i + 1) % 1000 == 0):
        print("Unlabeled Review %d of %d\n" % (i + 1, num_reviews))
    #The function review_to_sentences has been defined below
unlabeled_train = pd.read_csv("../../data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3) # Load a tokenizer from nltk tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # Initialize list for holding cleaned up sentences bagOfsentences = [] # Parse labeled sentences and append to bagOfsentences print("Parsing sentences from labeled training set") for review in train["review"]: bagOfsentences.append(preProc.review_to_sentences(review, tokenizer, False, True, False)) # Parse unlabeled sentences and append to bagOfsentences print("Parsing sentences from unlabeled set") for review in unlabeled_train["review"]: bagOfsentences.append(preProc.review_to_sentences(review, tokenizer, False, True, False)) # Save bagOfsentences json.dump(bagOfsentences,open("../../classifier/bagOfsentences.json", "a")) # Here is the function review_to_sentences def review_to_sentences(review, tokenizer, sentiment="",removeStopwords=False, removeNumbers=False, removeSmileys=False): """