Example #1
0
 # Load the pre-trained Punkt sentence tokenizer shipped with nltk.
 tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
 
 num_reviews = len(train["review"])
 
 labeled = []
 
 # Clean labeled reviews: each entry is the tokenized review paired with its
 # sentiment label (passed as a string tag to review_to_sentences).
 for i in range(0, num_reviews):
 
     # Progress indicator every 1000 reviews.
     if (i + 1) % 1000 == 0:
         print("Labeled Review %d of %d\n" % (i + 1, num_reviews))
 
     # The function review_to_sentences has been defined below.
     labeled.append(preProc.review_to_sentences(train.review[i], tokenizer,
                                                str(train.sentiment[i])))
 
 # Save cleaned up labeled reviews.
 # BUG FIX: the original passed a bare open(...) into json.dump, leaking the
 # file handle; a context manager guarantees flush + close even on error.
 with open("../../data/labeledSentiFFF.json", "w") as out_file:
     json.dump(labeled, out_file)
 
 unlabeled = []
 
 # Clean unlabeled reviews.
 # BUG FIX: the original looped over num_reviews (the size of the *labeled*
 # set) while indexing unlabeled_train, which is wrong whenever the two sets
 # differ in length. Size the loop from the unlabeled set itself.
 num_unlabeled = len(unlabeled_train["review"])
 for i in range(0, num_unlabeled):
 
     if (i + 1) % 1000 == 0:
         print("Unlabeled Review %d of %d\n" % (i + 1, num_unlabeled))
 
     # The function review_to_sentences has been defined below.
     unlabeled.append(preProc.review_to_sentences(unlabeled_train.review[i], tokenizer))
Example #2
0
                                  quoting=3)

    # Load the pre-trained Punkt sentence tokenizer shipped with nltk.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Accumulates every tokenized sentence from both the labeled and the
    # unlabeled training sets.
    bagOfsentences = []

    # Parse labeled sentences and append to bagOfsentences.
    print("Parsing sentences from labeled training set")
    for review in train["review"]:
        bagOfsentences.append(
            preProc.review_to_sentences(review, tokenizer, False, True, False))

    # Parse unlabeled sentences and append to bagOfsentences.
    print("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        bagOfsentences.append(
            preProc.review_to_sentences(review, tokenizer, False, True, False))

    # Save bagOfsentences.
    # BUG FIX: the original opened the file in append mode ("a") and never
    # closed it. Re-running the script would concatenate JSON documents into
    # a file json.load cannot parse, and the handle leaked. Write mode plus a
    # context manager fixes both.
    with open("../../classifier/bagOfsentences.json", "w") as out_file:
        json.dump(bagOfsentences, out_file)

# Here is the function review_to_sentences
Example #3
0
    # Load the pre-trained Punkt sentence tokenizer shipped with nltk.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    num_reviews = len(train["review"])

    labeled = []

    #Clean labeled reviews
    for i in range(0, num_reviews):

        # Progress indicator every 1000 reviews.
        if ((i + 1) % 1000 == 0):

            print("Labeled Review %d of %d\n" % (i + 1, num_reviews))

        #The function review_to_sentences has been defined below
        labeled.append(
            preProc.review_to_sentences(train.review[i], tokenizer,
                                        str(train.sentiment[i])))

    #Save cleaned up labeled reviews
    # NOTE(review): mode "a" appends a fresh JSON document on every run,
    # which makes the file unparseable by json.load, and the open() handle
    # is never closed — consider `with open(..., "w") as f: json.dump(...)`.
    json.dump(labeled,
              open("../../classifier/doc2vec/labeledSentiFFF.json", "a"))

    unlabeled = []

    #Clean unlabeled reviews
    # NOTE(review): this loop bound reuses num_reviews from the *labeled*
    # set while (presumably, per the parallel Example #1) indexing the
    # unlabeled set below — wrong if the two sets differ in size; verify.
    for i in range(0, num_reviews):

        if ((i + 1) % 1000 == 0):

            print("Unlabeled Review %d of %d\n" % (i + 1, num_reviews))

        #The function review_to_sentences has been defined below
Example #4
0
    # Load the unlabeled training reviews from the tab-separated data file
    # (quoting=3 == csv.QUOTE_NONE: the raw review text contains quotes).
    unlabeled_train = pd.read_csv("../../data/unlabeledTrainData.tsv",
                                  header=0, delimiter="\t", quoting=3)

    # Load the pre-trained Punkt sentence tokenizer shipped with nltk.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Accumulates tokenized sentences from both review sets.
    bagOfsentences = []

    # Parse labeled sentences and append to bagOfsentences.
    print("Parsing sentences from labeled training set")
    for review in train["review"]:
        bagOfsentences.append(
            preProc.review_to_sentences(review, tokenizer, False, True, False))

    # Parse unlabeled sentences and append to bagOfsentences.
    print("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        bagOfsentences.append(
            preProc.review_to_sentences(review, tokenizer, False, True, False))

    # Save bagOfsentences.
    # BUG FIX: the original opened the file in append mode ("a") and never
    # closed it. Re-running the script would concatenate JSON documents into
    # a file json.load cannot parse, and the handle leaked. Write mode plus a
    # context manager fixes both.
    with open("../../classifier/bagOfsentences.json", "w") as out_file:
        json.dump(bagOfsentences, out_file)

# Here is the function review_to_sentences 

def review_to_sentences(review, tokenizer, sentiment="",removeStopwords=False, removeNumbers=False, removeSmileys=False):
    """