def bag_of_words():
    train = pd.read_csv("train_dummy.csv", delimiter=',')
    num_reviews = train["review"].size

    #print "Cleaning and parsing the training set movie reviews...\n"
    clean_train_reviews = []
    for i in xrange(0, num_reviews):
        clean_train_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(train["review"][i],
                                                     True)))

    #print "Creating the bag of words..\n"

    #vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    vectorizer = TfidfVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 min_df=1,
                                 max_features=500)
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    #print train_data_features
    train_data_features = train_data_features.toarray()
    scaler = StandardScaler()
    scaler.fit(train_data_features)
    train_data_features = scaler.transform(train_data_features)

    clf = MLPClassifier(activation='logistic', hidden_layer_sizes=(500, ))

    clf = clf.fit(train_data_features, train["rating"])

    clean_test_reviews = []

    test = pd.read_csv("test_dummy.csv", delimiter=',')
    num_reviews1 = test["review"].size

    #print "Cleaning and parsing the test set movie reviews...\n"
    for i in xrange(0, num_reviews1):
        clean_test_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    test_data_features = scaler.transform(test_data_features)
    #print len(test_data_features)

    #print "Predicting test labels ...\n"
    result = clf.predict(test_data_features)
    check = test['rating'].values
    #print train_data_features
    #print test_data_features
    #print "The accuracy score for decision tree is"
    print accuracy_score(check, result)
    print confusion_matrix(check, result)
    print classification_report(check, result)
Example #2
def getCleanReviews(reviews, skip=0, limit=0, dispose_percent=(0, 0)):
    clean_reviews = []
    end = None if limit == 0 else limit
    for review in reviews["review"][skip:end]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(
            review, remove_stopwords=True, dispose_percent=dispose_percent))
    return clean_reviews
def getCleanReviews(reviews, useSmall=None, remove_stopwords=True):
    clean_reviews = []

    if useSmall:
        for review in reviews["review"][0:useSmall]:
            clean_reviews.append( KaggleWord2VecUtility.review_to_wordlist( review, remove_stopwords=remove_stopwords ))
        return clean_reviews
    else:
        for review in reviews["review"]:
            clean_reviews.append( KaggleWord2VecUtility.review_to_wordlist( review, remove_stopwords=remove_stopwords ))
        return clean_reviews
Example #4
def clean_text_field(dataset, fieldname, translate):
    cleaned_field = []
    for i in range(0, len(dataset[fieldname])):
        if translate:
            blob = TextBlob(dataset[fieldname][i]).translate(from_lang='pt-br',
                                                             to="en")
            cleaned_field.append(" ".join(
                KaggleWord2VecUtility.review_to_wordlist(str(blob), True)))
        else:
            cleaned_field.append(" ".join(
                KaggleWord2VecUtility.review_to_wordlist(
                    dataset[fieldname][i], True)))
    return cleaned_field
Example #5
def main():
    start_time = datetime.now()

    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    # test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
    train, test, y, y_test = cross_validation.train_test_split(df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)

    print "Cleaning and parsing movie reviews...\n"      
    traindata = []
    for i in xrange(0, len(train)):
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], False)))
    testdata = []
    for i in xrange(0, len(test)):
        testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], False)))
    print 'vectorizing... ',
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
    X_all = traindata + testdata
    lentrain = len(traindata)

    print "fitting pipeline... ",
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)

    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    model = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                               C=1, fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)
    print "10 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=10, scoring='roc_auc'))

    print "Retrain on all training data, predicting test labels...\n"
    model.fit(X, y)
    # result = model.predict_proba(X_test)[:,1] # predict as probability
    result = model.predict(X_test)
    # output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

    # Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
    output = pd.DataFrame(data={"sentiment":y_test, "predict_sentiment":result})
    output['succeed'] = output['sentiment'] == output['predict_sentiment']

    groupby = output.groupby('succeed')
    print 'Result Evaluation'
    print groupby['sentiment'].agg(['count'])

    # Use pandas to write the comma-separated output file
    output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model_linear.csv'), index=False, quoting=3)
    print "Wrote results to Bag_of_Words_model_linear.csv"

    print datetime.now() - start_time
Example #6
def bag_of_words():
    train = pd.read_csv("train_dummy.csv", delimiter=',')
    num_reviews = train["review"].size

    print "Cleaning and parsing the training set movie reviews...\n"
    clean_train_reviews = []
    for i in xrange(0, num_reviews):
        clean_train_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(train["review"][i],
                                                     True)))

    print "Creating the bag of words..\n"

    #vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    vectorizer = TfidfVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 min_df=1)
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    #print train_data_features
    train_data_features = train_data_features.toarray()

    clf = DecisionTreeClassifier(criterion="gini", splitter="best")

    clf = clf.fit(train_data_features, train["rating"])

    clean_test_reviews = []

    test = pd.read_csv("test_dummy.csv", delimiter=',')
    num_reviews1 = test["review"].size

    print "Cleaning and parsing the test set movie reviews...\n"
    for i in xrange(0, num_reviews1):
        clean_test_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    #print len(test_data_features)
    tree.export_graphviz(clf, out_file='tree.dot')

    print "Predicting test labels ...\n"
    result = clf.predict(test_data_features)
    check = test['rating'].values
    #print train_data_features
    #print test_data_features
    print "The accuracy score is"
    print accuracy_score(check, result)
def generate_word2vec(model_name, dataset_list):

    sentences = []

    nltk.download('punkt')  # only the punkt sentence tokenizer is needed here
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    for i, dataset in enumerate(dataset_list):
        print("Parsing sentences from dataset " + str(i + 1))
        for review in dataset["review"]:
            sentences += KaggleWord2VecUtility.review_to_sentences(
                review, tokenizer)

    print("Training Word2Vec model...")
    model = Word2Vec(sentences, workers=NUM_WORKERS, \
                size=NUM_FEATURES, min_count = MIN_WORD_COUNT, \
                window = CONTEXT, sample = DOWNSAMPLING, seed=1)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # You can load the model later using Word2Vec.load()
    model.save(model_name)

    return model
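A minimal usage sketch for the saved model (the model name below is hypothetical, and it assumes the older gensim API used above, where similarity queries can be issued directly on the model object):

model = Word2Vec.load("my_word2vec_model")
print(model.most_similar("movie", topn=5))  # nearest words by cosine similarity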
def tokenize(sentence, grams):
    words = KaggleWord2VecUtility.review_to_wordlist(sentence)
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i : i + gram])]
    return tokens
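A hedged usage example; the exact output depends on how review_to_wordlist lowercases and filters the input:

tokens = tokenize("a great movie", grams=[1, 2])
# roughly: ['a', 'great', 'movie', 'a_*_great', 'great_*_movie']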
def load_data_and_labels_kaggle2(test_data_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    y = []
    # Load data from files
    training_examples = pd.read_csv(test_data_file,
                                    header=0,
                                    delimiter='\t',
                                    quoting=3)
    # Generate labels
    for x in training_examples["Sentiment"]:
        if x == 1:  #positive
            test_labels = [1]
        else:  #negative
            test_labels = [0]
        y = np.concatenate([y, test_labels], 0)
    print(y)
    print(y.shape)
    print("sentiment complete")
    #print(test_examples["review"][:10])
    # preprocessing
    sentences = []
    for review in training_examples["review"]:
        tmpstr = KaggleWord2VecUtility.review_to_corpus(review,
                                                        remove_stopwords=False)
        sentences.append(tmpstr)

    print("preprocessing complete")
    return [sentences, y]
Example #10
def getCleanReviews(reviews):
    clean_reviews = []
    clean_reviews = KaggleWord2VecUtility.apply_by_multiprocessing(
        reviews["review"],
        KaggleWord2VecUtility.review_to_wordlist_with_tag,
        workers=4)
    return clean_reviews
def generate_doc2vec(model_name, dataset_list, NUM_FEATURES=100, CONTEXT=5):

    print(model_name)

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # ****** Split the training sets into clean docs
    #
    # flatten each list of sentences from a review into a single list of words
    docs = []  # Initialize an empty list of docs

    for i, dataset in enumerate(dataset_list):
        print("Parsing sentences from dataset " + str(i + 1))
        for review in dataset["review"]:
            sentences = KaggleWord2VecUtility.review_to_sentences(
                review, tokenizer)
            docs.append([word for sentence in sentences for word in sentence])

    # Initialize and train the model
    print("Training Doc2Vec model...")
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
    model = Doc2Vec(documents, workers=NUM_WORKERS, \
                vector_size=NUM_FEATURES, min_count = MIN_WORD_COUNT, \
                window = CONTEXT, seed=1)

    model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)

    # You can load the model later using Doc2Vec.load()
    print(model_name)
    model.save(model_name)

    return model
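A hedged inference sketch for the saved model (the token list is made up; it assumes the gensim 1.x-3.x API implied by delete_temporary_training_data above):

model = Doc2Vec.load(model_name)
vec = model.infer_vector(["an", "example", "review", "split", "into", "words"])
print(model.docvecs.most_similar([vec], topn=5))  # most similar training documents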
Example #12
def create_task():
    if not request.json or not 'id' in request.json:
        abort(400)
    task = {
        'id': request.json['id'],
        'text': request.json['text'],
    }
    clean_test_descripciones = []
    app.logger.info('petition_classification: ' + task['text'])
    features = review_words(task['text'])
    clean_test_descripciones.append(u" ".join(
        KaggleWord2VecUtility.review_to_wordlist(features, True)))

    # Uses chord to run two jobs and a callback after processing ends
    # 1) A text classifier
    # 2) A profanity filter
    # 3) A callback to put all together in a JSON
    callback = update_remote_petition.subtask()
    chord([
        evaluate_petition.s(task['id'], clean_test_descripciones),
        catch_bad_words_in_text.s(task['text'])
    ])(callback)

    return jsonify({
        'id': request.json['id'],
        'text': request.json['text']
    }), 201
Example #13
def getCleanReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(
            KaggleWord2VecUtility.review_to_wordlist(review,
                                                     remove_stopwords=True))
    return clean_reviews
Example #14
def classify(sentiment_str):
    sentiment_str = [
        " ".join(KaggleWord2VecUtility.review_to_wordlist(sentiment_str, True))
    ]
    sentiment_str = vectorizer.transform(sentiment_str)
    np.asarray(sentiment_str)
    return forest.predict(sentiment_str)
    def predict_review(self, review, dispose_percent=(0, 0)):
        tok_sents = KaggleWord2VecUtility.review_to_sentences(
            review,
            self.tokenizer,
            case_sensitive=True,
            dispose_percent=dispose_percent)

        num_sarcastic = 0
        num_regular = 0
        for sent in tok_sents:
            prediction = self.predict_sentence(sent)
            for i in range(2):
                if prediction[i] > sarcasm_confidence:
                    if self.classifier.classes_[i] == 'ironic':
                        num_sarcastic += 1
                    else:
                        num_regular += 1

        if num_regular == 0 and num_sarcastic == 0:
            return 'regular'

        #More than 'sarcasm_thres' percent of sentences must be classified sarcastic, for the review to be classified sarcastic.
        if num_sarcastic > num_regular * sarcasm_thres:
            return 'ironic'
        return 'regular'
def create_task():
    if not request.json or not 'id' in request.json:
        abort(400)
    task = {
        'id': request.json['id'],
        'text': request.json['text'],
    }
    clean_test_descripciones = []
    app.logger.info('petition_classification: ' + task['text'])
    features = review_words(task['text'])
    clean_test_descripciones.append(u" ".join(
        KaggleWord2VecUtility.review_to_wordlist(features, True)))

    # Uses chord to run two jobs and a callback after processing ends
    # 1) A text classifier
    # 2) A profanity filter
    # 3) A callback to put all together in a JSON
    callback = update_remote_petition.subtask()
    chord([
        evaluate_petition.s(task['id'], clean_test_descripciones),
        catch_bad_words_in_text.s(task['text'])
    ])(callback)

    return jsonify({'id': request.json['id'],
                    'text': request.json['text']}), 201
Example #17
def getCleanReviews(reviews):
    """
  Use multiple workers (multi threads)
  """
    clean_reviews = []
    clean_reviews = KaggleWord2VecUtility.apply_by_multiprocessing(
        reviews["review"], KaggleWord2VecUtility.review_to_wordlist, workers=4)
    return clean_reviews
def getCleanDescriptions(descriptions):
    clean_descriptions = []
    local_counter = 0
    for description in descriptions["description"]:
        clean_descriptions.append(KaggleWord2VecUtility.review_to_wordlist(description, remove_stopwords=True))
        local_counter += 1
        print('Adding line : ' + str(local_counter))
    return clean_descriptions
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, True))
    
    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(LabeledSentence(clean_reviews[i], [id_label]))
    return labelized
Example #20
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review))

    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(LabeledSentence(clean_reviews[i], [id_label]))
    return labelized
def process_data_extend(is_char=False):
    # Load the training and test sets
    x_train, y_train = get_data(train_path, convert_label=False)
    x_test, y_test = get_data(test_path, convert_label=False)
    # Convert the texts into sentences, where each sentence is a list of words
    train_sentences = []  # Initialize an empty list of sentences
    test_sentences = []  # Initialize an empty list of sentences
    for news in x_train:
        if len(news) > 0:
            train_sentences.append(
                " ".join(KaggleWord2VecUtility.review_to_wordlist(news, is_char=is_char,
                                                                  remove_stopwords=True)))
    for news in x_test:
        if len(news) > 0:
            test_sentences.append(
                " ".join(KaggleWord2VecUtility.review_to_wordlist(news, is_char=is_char,
                                                                  remove_stopwords=True)))

    # Augment the short texts
    train_extend = get_similar_words(train_sentences)
    test_extend = get_similar_words(test_sentences)

    # Convert to numeric index form
    all_text = train_sentences
    all_text.extend(test_sentences)
    sequences, word_index = gen_word_index(all_text, is_char=is_char)
    data = sequence.pad_sequences(sequences, maxlen=maxlen)
    X_train = data[:train_size]
    X_test = data[train_size:]
    all_text_extend = train_extend
    all_text_extend.extend(test_extend)
    sequences, word_index_extend = gen_word_index(all_text_extend, is_char=is_char)
    data_extend = sequence.pad_sequences(sequences, maxlen=maxlen)
    X_train_extend = data_extend[:train_size]
    X_test_extend = data_extend[train_size:]

    # Convert the labels to numpy arrays
    Y_train = np.array(y_train)
    Y_test = np.array(y_test)
    # one-hot
    encoder = LabelBinarizer().fit(Y_train)
    Y_train = encoder.transform(Y_train)
    Y_test = encoder.transform(Y_test)
    return X_train, X_train_extend, Y_train, X_test, X_test_extend, Y_test, word_index, word_index_extend
def get_words(reviews):
    """
    Gets list of relevant words per review using Kaggle's Word2VecUtility
    https://github.com/danielfrg/kaggle-word2vec/blob/master/DeepLearningMovies/KaggleWord2VecUtility.py_
    :param reviews: list of reviews which should be transformed to words
    :return: list of words per review
    """
    words = []
    for review in reviews:
        words.append("".join(KaggleWord2VecUtility.review_to_wordlist(review)))

    return words
def bag_of_words():
    train = pd.read_csv("train_dummy.csv", delimiter=',')
    num_reviews = train["review"].size

    #print "Cleaning and parsing the training set movie reviews...\n"
    clean_train_reviews = []
    for i in xrange(0, num_reviews):
        clean_train_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(train["review"][i],
                                                     True)))

    vectorizer = TfidfVectorizer(max_features=2500, min_df=4)
    train_data_features = vectorizer.fit_transform(
        clean_train_reviews).todense()

    print train_data_features.shape
    train_output = train['rating']
    clf = SVC(C=100.0, kernel='sigmoid', cache_size=1000, gamma=1.0)
    clf = clf.fit(train_data_features, train_output)

    clean_test_reviews = []

    test = pd.read_csv("test_dummy.csv", delimiter=',')
    num_reviews1 = test["review"].size

    for i in xrange(0, num_reviews1):
        clean_test_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

    test_data_features = vectorizer.transform(clean_test_reviews).todense()

    result = clf.predict(test_data_features)
    check = test['rating']
    print check

    print accuracy_score(check, result)
    print confusion_matrix(check, result)
    print classification_report(check, result)
Example #24
def perform_preprocess_on_review_data_of_train(review_data_in_train):
    sentences = []

    for review in review_data_in_train:
        processed = KaggleWord2VecUtility.review_to_sentences(
            review, remove_stopwords=False)
        # print("processed",processed)
        # [['with', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'start',
        #   'listen', 'to', 'his', 'music', 'watch', 'the', 'odd', 'documentari', 'here', 'and', 'there', 'watch',
        #   'the', 'wiz', 'and', 'watch',
        sentences += processed

    return sentences
Example #25
def weight_file_processing(all):
    traindata = []
    for i in range(0, len(all["text"])):
        traindata.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(all["text"][i], True)))
    all_wordlist = " ".join(traindata).split()
    counts = dict()
    for i in all_wordlist:
        counts[i] = counts.get(i, 0) + 1
    sorted_x = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    # Open in text append mode so plain strings can be written directly.
    with open("data/reuters_vocab.txt", "a") as weight_file_build:
        for each in sorted_x:
            weight_file_build.write(each[0] + " " + str(each[1]) + " " + "\n")
Example #26
def create_classifier(subreddits, X_train, y_train, num_features):

    # nltk.download()  # Download text data sets, including stop words

    clean_train_set = []

    print "Cleaning and parsing the training set...\n"
    for i in xrange(0, len(X_train)):
        clean_train_set.append(KaggleWord2VecUtility.review_to_wordlist(X_train[i], True))

     # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
        level=logging.INFO)

    # Set values for various parameters
    # num_features, min_word_count, num_workers = options

    # num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print "Training Word2Vec model..."
    model = Word2Vec(clean_train_set, workers=num_workers, \
                size=num_features, min_count = min_word_count, \
                window = context, sample = downsampling, seed=1)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model_name = "300features_40minwords_10context"
    model.save(model_name)

    print "Creating average feature vecs for training comments"

    train_data_features = getAvgFeatureVecs( clean_train_set, model, num_features )

    print "Training the model (this may take a while)..."

    classifier = LogisticRegression( solver='lbfgs', multi_class='multinomial')
    classifier = classifier.fit(train_data_features, y_train)

    return (model, classifier)
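getAvgFeatureVecs is referenced here and in the test snippet below but is not shown in this listing; a minimal sketch of the usual averaging helper (after the Kaggle word2vec tutorial, assuming numpy is imported as np) might look like:

def getAvgFeatureVecs(reviews, model, num_features):
    # Average the word vectors of each review, skipping out-of-vocabulary words.
    feature_vecs = np.zeros((len(reviews), num_features), dtype="float32")
    vocab = set(model.index2word)  # model.wv.index2word on gensim >= 1.0
    for i, words in enumerate(reviews):
        nwords = 0
        for word in words:
            if word in vocab:
                nwords += 1
                feature_vecs[i] += model[word]  # model.wv[word] on gensim >= 1.0
        if nwords > 0:
            feature_vecs[i] /= nwords
    return feature_vecs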
def getLabeledSentences(reviews, prefix, skip=0, limit=0, dispose_percent=0):
    labels = []
    index = 0
    if limit == 0:
        for review in reviews["review"][skip:]:
            index += 1
            labels.append(
                LabeledSentence(
                    KaggleWord2VecUtility.review_to_wordlist(
                        review,
                        remove_stopwords=False,
                        dispose_percent=dispose_percent),
                    [prefix + str(index)]))
    else:
        for review in reviews["review"][skip:limit]:
            index += 1
            labels.append(
                LabeledSentence(
                    KaggleWord2VecUtility.review_to_wordlist(
                        review,
                        remove_stopwords=False,
                        dispose_percent=dispose_percent),
                    [prefix + str(index)]))
    return labels
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
Example #29
def test_classifier(model, classifier, subreddits, X_test, y_test, num_features):
    # Create an empty list and append the clean reviews one by one
    clean_test_set = []

    print "Cleaning and parsing the test set ...\n"
    for i in xrange(0, len(X_test)):
        clean_test_set.append(KaggleWord2VecUtility.review_to_wordlist(X_test[i], True))

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = getAvgFeatureVecs( clean_test_set, model, num_features )

    # Use the random forest to make sentiment label predictions
    print "Predicting test labels...\n"

    predicted = classifier.predict(test_data_features)

    print metrics.accuracy_score(y_test, predicted)
    # print metrics.confusion_matrix(y_test, predicted)
    print metrics.classification_report(y_test, predicted)
Example #30
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
Example #31
def predict(utterance):

    clean = []

    clean.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(utterance, True)))

    #print >> sys.stderr, "kaggle ", KaggleWord2VecUtility.review_to_wordlist(utterance, True)

    features = vectorizer.transform(clean)
    np.asarray(features)
    # Use the random forest to make sentiment label predictions

    #print ("Predicting sentiment...\n")
    #print >> sys.stderr, "features ",  features

    result = model.predict(features)

    #print >> sys.stderr, "RESULT ",  result
    #print (result)

    return result[0]
def predict_sentiment(test):
    clean_test_reviews = []
    reviewstext = []
    with open('scraped.csv', 'rt') as csvfile1:
        reader = csv.reader(csvfile1)
        headers = next(reader, None)
        for row in reader:
            reviewstext.append(row[1])
            clean_test_reviews.append(" ".join(
                KaggleWord2VecUtility.review_to_wordlist(
                    (row[1]).decode('utf-8', 'ignore'), True)))
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    result = forest.predict(test_data_features)
    return {"review": reviewstext, "sentiment": result}
    #output = pd.DataFrame( data={"review":reviewstext, "sentiment":result} )
    #output.to_csv(os.path.join(os.path.dirname(__file__), 'Bag_of_Words_model.csv'), index=False, quoting=3, quotechar='', sep='\t')
    #output.to_csv(os.path.join(os.path.dirname(__file__), 'Bag_of_Words_model.csv'), index=False, quoting=csv.QUOTE_NONE, quotechar='', sep='\t')

    #print test
    # Create an empty list and append the clean reviews one by one
Example #33
    test_pkl = 'shortened_' + test_pkl

try:
    traindata = pickle.load(open(os.path.join(base_path, 'data',
                                              train_pkl), 'r'))
    testdata = pickle.load(open(os.path.join(base_path, 'data',
                                             test_pkl), 'r'))
except IOError as e:
    if e.errno != errno.ENOENT:
        raise e
    else:
        _logger.info('cleaning and parsing movie reviews')

        traindata = []
        for i in xrange(0, len(train["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(train["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            traindata.append(' '.join(review))
        testdata = []
        for i in xrange(0, len(test["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(test["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            testdata.append(' '.join(review))

        pickle.dump(traindata, open(os.path.join(base_path, 'data',
                                                 train_pkl), 'w'))
        pickle.dump(testdata, open(os.path.join(base_path, 'data',
                                                test_pkl), 'w'))
train_i, test_i = data.ix[:,11] != -1, data.ix[:,11] == -1

train = data.ix[train_i]
test = data.ix[test_i]

#train_i, valid_i = train_test_split( np.arange( len( train )), train_size = 0.8, random_state = 88 )
#train = train.ix[train_i]
#validation = train.ix[valid_i]

#

print "Parsing train job titles..."

clean_train_reviews = []
for title in train['abstract']:
	clean_train_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title,remove_stopwords=False)))

print "Parsing test reviews..."

clean_test_reviews = []
for title in test['abstract']:
	clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title,remove_stopwords=False)))

#print "Parsing validation reviews..."

#clean_valid_reviews = []
#for title in validation['title']:
#	clean_valid_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title,remove_stopwords=False)))

#
def clean_review_function(review):
    global master_word_dict, number_of_rows
    list_of_words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=False)
    return ' '.join(list_of_words)
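A hypothetical usage line, assuming a pandas DataFrame named train with a "review" column as in the surrounding snippets:

clean_train_reviews = train["review"].apply(clean_review_function).tolist()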
        if x in tag:
           Y.append(tag[x])

#for i in O_test:
#test.append(O_test[i][0])

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

#print("Parsing sentences from training set")
    for review in train:
        sentences += KaggleWord2VecUtility.review_to_sentences(review.encode('utf-8'), tokenizer)

# print "Parsing sentences from training set"
        #for review in test:
# sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
        level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count

    # Initialize an empty list to hold the clean reviews
    traindata = []
    testdata = []
     
    Y1=[]
    Y2=[]
    
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    
    for i in train:
        buf=[]
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i][0], True)))
        for j in train[i][3].split():
            if j in tag_dic:
               buf.append(tag_dic[j])
        Y1.append(buf)



    for i in test:
        buf=[]
        testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i][0], True)))
        for j in test[i][3].split():
            if j in tag_dic:
               buf.append(tag_dic[j])
        Y2.append(buf)
    # ****** Create a bag of words from the training set
from sklearn import cross_validation

model = Word2Vec.load_word2vec_format(constants.GOOGLE_WORD2VEC, binary=True)

train = pd.read_csv(os.path.join(
	os.path.dirname(__file__), '..', 'fixtures', 'labeledTrainData.tsv'),
	header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
test = pd.read_csv(os.path.join(
	os.path.dirname(__file__), '..', 'fixtures', 'testData.tsv'),
	header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
vectorizer = CountVectorizer(min_df=4)
vectorizer.fit(X_all)

start = time.time()

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
Example #39
model = None
if os.path.isfile(classifier_filename):
  model = Word2Vec.load(classifier_filename)
else:
  unlabeled_train = pd.read_csv('data/unlabeledTrainData.tsv', header=0,  delimiter="\t", quoting=3)

  # Verify the number of reviews that were read (100,000 in total)
  print ("Read {0} labeled train reviews, {1} test reviews, and {2} unlabeled reviews\n".format(len(train["review"][local_test_size:]), test["review"].size, unlabeled_train["review"].size ))

  sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') #Used for tokenizing paragraphs into individual sentences.

  sentences = []
  print ("Parsing sentences from training set")
  for review in train["review"][local_test_size:]:
      sentences += KaggleWord2VecUtility.review_to_sentences(review, sent_tokenizer, dispose_percent=percent_disposal)

  print ("Parsing sentences from unlabeled set")
  for review in unlabeled_train["review"]:
      sentences += KaggleWord2VecUtility.review_to_sentences(review, sent_tokenizer, dispose_percent=percent_disposal)
  del unlabeled_train

  #=======================================================================================================
  #  Train the model
  #=======================================================================================================

  # Initialize and train the model (this will take some time)
  print ("Training Word2Vec model...")
  model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling, seed=1)

  model.init_sims(replace=True) #Unloads everything from memory not related to querying the models
    #
    # Return the "bag of centroids"
    return bag_of_centroids
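Only the tail of the bag-of-centroids helper survives above; a hedged reconstruction of the usual implementation (the word_centroid_map argument, a word-to-cluster index dict produced by KMeans, is an assumption) would be:

def create_bag_of_centroids(wordlist, word_centroid_map):
    # Count how many words of the review fall into each cluster ("centroid").
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids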

if __name__ == '__main__':
    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, \
                    delimiter="\t", quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", \
                   quoting=3 )
    unlabeled_train = pd.read_csv( os.path.join(os.path.dirname(__file__), 'data', "unlabeledTrainData.tsv"), header=0,  delimiter="\t", quoting=3 )
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []  # Initialize an empty list of sentences

    print "Parsing sentences from training set"
    for review in train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    print "Parsing sentences from unlabeled set"
    for review in unlabeled_train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
        level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
def clean_review_function(review):
    list_of_sentences = KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=False)
    return list_of_sentences
Example #42
from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'labeledTrainData.tsv'), header=0, \
                delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'testData.tsv'), header=0, delimiter="\t", \
               quoting=3 )
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange( 0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0,len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3,  max_features=None,
        strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')
X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
Example #43
def getCleanTestReviews(skucollection):
    clean_skucollection = []
    for sku in skucollection["query"]:
        clean_skucollection.append( KaggleWord2VecUtility.sku_to_wordlist( sku, remove_stopwords=False ))
    return clean_skucollection
Example #44
def getCleanTrainReviews(skucollection):
    clean_skucollection = []
    for sku in skucollection["product_title"]:
        clean_skucollection.append( KaggleWord2VecUtility.sku_to_wordlist( sku, remove_stopwords=False ))
    return clean_skucollection



    # Create clean_train_reviews and clean_test_reviews as we did before
    #

    # Read data from files
    train = pd.read_csv( os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3 )
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3 )


    print "Cleaning training reviews"
    clean_train_reviews = []
    for review in train["review"]:
        clean_train_reviews.append( KaggleWord2VecUtility.review_to_wordlist( review, \
            remove_stopwords=True ))

    print "Cleaning test reviews"
    clean_test_reviews = []
    for review in test["review"]:
        clean_test_reviews.append( KaggleWord2VecUtility.review_to_wordlist( review, \
            remove_stopwords=True ))


    # ****** Create bags of centroids
    #
    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros( (train["review"].size, num_clusters), \
        dtype="float32" )

    # Transform the training set reviews into bags of centroids
    raw_input("Press Enter to continue...")


    print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
    #nltk.download()  # Download text data sets, including stop words

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    print "Cleaning and parsing the training set movie reviews...\n"
    for i in xrange( 0, len(train["review"])):
        print KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)
        clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))

    # ****** Create a bag of words from the training set
    #
    print "Creating the bag of words...\n"


    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 15000)
    print "Read %d labeled train skucollection " % (train["product_title"].size)



    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    print "Parsing sentences from training set"
    for sku in train["product_title"]:
        sentences += KaggleWord2VecUtility.sku_to_sentences(sku, tokenizer)

    # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
        level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words
Example #48
    csv.field_size_limit(sys.maxsize)
    # Read train data.
    train_word_vector = pd.read_pickle('all.pkl')

    # print(train_word_vector)

    # Use the NLTK tokenizer to split the paragraph into sentences.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []
    print "Parsing sentences from training set..."

    # Loop over each news article.
    for review in train_word_vector["text"]:
        try:
            # Split a review into parsed sentences.
            sentences += KaggleWord2VecUtility.review_to_sentences(
                review, tokenizer)
        except:
            continue

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
           level=logging.INFO)

    num_features = int(sys.argv[1])  # Word vector dimensionality
    min_word_count = 20  # Minimum word count
    num_workers = 40  # Number of threads to run in parallel
    context = 10  # Context window size
    downsampling = 1e-3  # Downsample setting for frequent words

    print "Training Word2Vec model..."
    # Train Word2Vec model.
    model = Word2Vec(sentences, workers=num_workers, hs = 0, sg = 1, negative = 10, iter = 25,\
def main():
    start_time = datetime.now()

    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    if GO_FOR_REAL:
        test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3 )
        train = df['review']
        train_sentiment = df['sentiment']
        test_id = test['id'].str.replace('"', '')
        test = test['review']
    else:
        train, test, train_sentiment, test_sentiment = cross_validation.train_test_split(df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)

    print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
    #nltk.download()  # Download text data sets, including stop words

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    print "Cleaning and parsing the training set movie reviews...\n"
    for i in xrange(0, len(train)):
        clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], True)))


    # ****** Create a bag of words from the training set
    #
    print "Creating the bag of words...\n"


    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
    # CountVectorizer transform each document into a word count vector, with each word as a feature.
    # Stop words are very frequent words in a language that may not have huge semantic impact, may dissolve the importance of other more meaning for words
    # N-gram provide word combination as a new feature.
    # stop_words: 'english' will use stop words from sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS, seem to make result more unstable.
    # max_feature: limit only the more commonly appeared words in document to be in array, None will allow all features, and increase vector size.
    # vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = 'english', max_features = None)

    # Tfidf: a normalization method to reduce weight of words that appear too frequent in dataset
    # TfidfVectorizer: CountVectorizer that run a tfidf normalization during transform
    vectorizer = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    print 'Train data feature shape: ' + str(train_data_features.shape)
    print 'Number of vocabularies/features: %d\n' %len(vectorizer.get_feature_names())

    # Numpy arrays are easy to work with, so convert the result to an
    # array
    train_data_features = train_data_features.toarray()

    # ******* Train a model using the bag of words
    #
    print "Training the model (this may take a while)..."


    # Initialize a Random Forest classifier with 100 trees
    # clf = RandomForestClassifier(n_estimators=100)
    # clf = svm.LinearSVC(C=1)
    clf = LogisticRegressionCV(cv=3, scoring='roc_auc', solver='liblinear', Cs=[3, 4, 5, 6, 7])

    # Cross validation, this takes a long time ...
    # print "4 Fold CV Score: ", np.mean(cross_validation.cross_val_score(clf, train_data_features, train_sentiment, cv=4, scoring='accuracy', n_jobs=4))

    # Fit the svc to the training set, using the bag of words as
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    model = clf.fit(train_data_features, train_sentiment)

    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = []

    print "Cleaning and parsing the test set movie reviews...\n"
    for i in xrange(0, len(test)):
        clean_test_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], True)))

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Use svc to make sentiment label predictions
    print "Predicting test labels...\n"

    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    if GO_FOR_REAL:
        result = model.predict_proba(test_data_features)[:, 1] # predict as probability
        output = pd.DataFrame(data={"id": test_id, "sentiment":result})
    else:
        result = model.predict(test_data_features)
        output = pd.DataFrame(data={"sentiment":test_sentiment, "predict_sentiment":result})
        output['succeed'] = output['sentiment'] == output['predict_sentiment']

        groupby = output.groupby('succeed')
        print 'Result Evaluation'
        print groupby['sentiment'].agg(['count'])

    # Use pandas to write the comma-separated output file
    output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model.csv'), index=False, quoting=csv.QUOTE_MINIMAL)
    print "Wrote results to Bag_of_Words_model.csv"

    print datetime.now() - start_time
    print 'Cs_'
    print getattr(model, 'Cs_')
    print 'scores_'
    print getattr(model, 'scores_')
    print 'C_'
    print getattr(model,'C_')
# In[12]:

import pandas as pd

# Read data from files 
article = pd.read_csv( "train_trend_1.csv")
article_test = pd.read_csv( "test_trend_1.csv")


# In[ ]:

print "Parsing train reviews..."

opinions = []
for opinion in article['Articles']:
    opinions.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( opinion )))


# In[ ]:

print "Parsing test reviews..."

opinions_test = []
for opinion_test in article_test['Articles']:
    opinions_test.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( opinion_test )))


# In[ ]:

# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros( (article['Articles'].size, num_clusters),     dtype="float32" )
def clean_review_function(review):
    list_of_words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=False)
    return ' '.join(list_of_words)
def getCleanReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append( KaggleWord2VecUtility.review_to_wordlist( review, remove_stopwords=True ))
    return clean_reviews
Example #53
train_file = 'data/labeledTrainData.tsv' 
test_file = 'data/testData.tsv'
output_file = 'data/bow_predictions.csv'

#

train = pd.read_csv( train_file, header = 0, delimiter = "\t", quoting = 3 )
test = pd.read_csv( test_file, header = 0, delimiter = "\t", quoting = 3 )

#

print "Parsing train reviews..."

clean_train_reviews = []
for review in train['review']:
	clean_train_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))

print "Parsing test reviews..."

clean_test_reviews = []
for review in test['review']:
	clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))

#

print "Vectorizing train..."

vectorizer = TfidfVectorizer( max_features = 40000, ngram_range = ( 1, 3 ), 
	sublinear_tf = True )
train_x = vectorizer.fit_transform( clean_train_reviews )
testing_documents = [item[2] + ' ' + item[3] for item in fetched_testing_data]

print("Predicting the labels of the test set...")
print("%d documents" % len(testing_documents))
print("%d categories" % len(testing_outputs))

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list

print("Cleaning and parsing the training set...\n")
for i in xrange(0, len(training_documents)):
    print('Row ' + str(i))
    clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(training_documents[i], True)))

# Initialize the "CountVectorizer" object, scikit-learn's bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
     test["review"].size, unlabeled_train["review"].size )



    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    print "Parsing sentences from training set"
    for review in train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    print "Parsing sentences from unlabeled set"
    for review in unlabeled_train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
        level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
Example #56
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    # making a connection to mongoDB
    client = MongoClient('localhost', 27017)
    db = client.cs336
    db.create_collection("unlabeled_review")

    print "Cleaning and parsing the training set movie reviews...\n"
   # for i in xrange( 0, len(train["review"])):
    for i in xrange(0, 500):
        #clean_train_revie0ws.append(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)
        #clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_worddict(train["review"][i], True)))
        print i
        clean_train_review = KaggleWord2VecUtility.review_to_worddict(train["review"][i], True)
        #pprint(clean_train_reviews)
        record = {}
        record["id"] = train["id"][i]
       # record["sentiment"] = train["sentiment"][i]
        record["review"] = clean_train_review
        
        #pprint(record)

        db.unlabeled_review.insert_one(record)
    
    print "Inserted all documents to the collection"

    #pprint(clean_train_reviews[0])

    # ****** Create a bag of words from the training set
Example #57
import numpy as np
import pickle


train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, \
                delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", \
               quoting=3 )
y = train["sentiment"]  
print "Cleaning and parsing movie reviews...\n"

# print "lentrain ",len(train["review"])

traindata = []
for i in xrange( 0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0,len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ', 
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
        strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')
X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
        name_list = ['sentiment', 'id', 'time', 'query', 'user', 'text']
        train = pd.read_csv(data_path + "training.1600000.processed.noemoticon.csv", \
                         header=None, names=name_list)

        test = pd.read_csv(data_path + "testdata.manual.2009.06.14.csv", \
                         header=None, names=name_list)

        # Initialize an empty list to hold the clean tweets
        clean_train_tweets = []

        # Loop over each review; create an index i that goes from 0 to the length
        # of the movie review list
        print "Cleaning and parsing the training set tweets...\n"
        for i in xrange(0, len(train["text"])):
            print (str(i))
            clean_train_tweets.append(KaggleWord2VecUtility.review_to_wordlist(train["text"][i], remove_stop_words))

        train_labels = train['sentiment'].tolist()

        # Initialize an empty list to hold the clean tweets
        clean_test_tweets = []

        # Loop over each review; create an index i that goes from 0 to the length
        # of the movie review list
        print "Cleaning and parsing the test set tweets...\n"
        for i in xrange(0, len(test["text"])):
            print (str(i))
            clean_test_tweets.append(KaggleWord2VecUtility.review_to_wordlist(test["text"][i], remove_stop_words))

        test_labels = test['sentiment'].tolist()