Example #1
0
def get_data(data_file):
    """Load review texts for *data_file* (resolved under DATA_DIR).

    Returns a (positive_texts, negative_texts) pair for .tsv files;
    otherwise returns whatever single structure data_helper yields.
    """
    full_path = os.path.join(DATA_DIR, data_file)
    if "tsv" in data_file:
        pos_texts, neg_texts = data_helper.get_reviews(full_path)
        return pos_texts, neg_texts
    return data_helper.get_reviews(full_path)
def build_features(data_file, feat_name, binning):
    """Read reviews from *data_file* and build (features, category) tuples.

    :param data_file: file path relative to the working directory
    :param feat_name: feature set to extract
    :param binning: binning flag forwarded to the feature builder
    :return: (features_category_tuples, texts)
    """
    # load the raw review texts keyed by category
    texts_by_category = data_helper.get_reviews(os.path.join("./", data_file))

    # convert them to labeled feature dictionaries
    return get_features_category_tuples(texts_by_category, feat_name, binning)
Example #3
0
def features_stub(path=os.path.join(DATA_DIR, "train_examples.tsv"),
                  feature_set="word_features"):
    """Build *feature_set* features for the labeled reviews found at *path*.

    :return: (features_category_tuples, texts)
    """
    pos_texts, neg_texts = data_helper.get_reviews(path)

    labeled_texts = {"positive": pos_texts, "negative": neg_texts}

    return get_features_category_tuples(labeled_texts, feature_set)
Example #4
0
def build_features(data_file, feat_name):
    """Load labeled reviews from *data_file* under DATA_DIR and produce
    (features, category) tuples.

    :return: (features_category_tuples, texts)
    """
    # read the positive/negative review texts
    pos_texts, neg_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))

    labeled = {"positive": pos_texts, "negative": neg_texts}

    # build the feature dictionaries for each category
    return get_features_category_tuples(labeled, feat_name)
def build_features(data_file, feat_name, save_feats=None):
    """Build features for *data_file*; optionally persist them.

    :param data_file: file name under DATA_DIR ("test.txt" is unlabeled)
    :param feat_name: feature set to extract
    :param save_feats: output file name, or None to skip saving
    :return: (features_category_tuples, texts)
    """
    source = os.path.join(DATA_DIR, data_file)

    # the test file holds unlabeled reviews; everything else is pos/neg
    if data_file == "test.txt":
        category_texts = {"test data": data_helper.get_reviews(source)}
    else:
        pos_texts, neg_texts = data_helper.get_reviews(source)
        category_texts = {"positive": pos_texts, "negative": neg_texts}

    feats, texts = get_features_category_tuples(category_texts, feat_name)

    # save features to file when requested
    if save_feats is not None:
        write_features_category(feats, save_feats)

    return feats, texts
def features_stub():
    """Build word features for the restaurant training data and save them.

    Incomplete stub: it deliberately raises NotImplementedError before an
    output filename has been chosen.
    """
    datafile = "restaurant-training.data"

    raw_data = data_helper.read_file(os.path.join(DATA_DIR, datafile))
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    category_texts = {"positive": positive_texts, "negative": negative_texts}
    feature_set = "word_features"

    features_category_tuples, texts = get_features_category_tuples(
        category_texts, feature_set)

    # BUG FIX: `raise NotImplemented` raises a TypeError in Python 3 because
    # NotImplemented is a sentinel value, not an exception class; the
    # intended stub marker is NotImplementedError.
    raise NotImplementedError
    filename = "???"  # unreachable placeholder output name
    write_features_category(features_category_tuples, filename)
Example #7
0
def features_stub():
    """Build word+POS+opinion features for imdb-training and write them
    to "<feature_set>-testing-features.txt"."""
    source = "imdb-training.data"
    pos_texts, neg_texts = data_helper.get_reviews(
        data_helper.read_file(source))

    labeled = {"positive": pos_texts, "negative": neg_texts}
    # alternatives: word_pos_features, word_features, word_pos_liwc_features
    chosen_set = "word_pos_opinion_features"

    tuples, texts = get_features_category_tuples(labeled, chosen_set)

    # raise NotImplemented
    write_features_category(tuples, chosen_set + "-testing-features.txt")
def build_features(data_file, feat_name, save_feats=None, binning=False):
    """Build features (LIWC variant) for *data_file*, optionally saving them.

    :return: (features_category_tuples, texts)
    """
    # read the labeled review texts
    pos_texts, neg_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))

    labeled = {"positive": pos_texts, "negative": neg_texts}

    # NOTE: this variant forwards data_file itself to the feature builder
    tuples, texts = features.get_features_category_tuples(
        labeled, feat_name, data_file)

    # persist when an output name was given
    if save_feats is not None:
        write_features_category(tuples, save_feats)

    return tuples, texts
Example #9
0
def build_features(data_file, feat_name, save_feats=None, binning=False):
    """Build (features, category) tuples from the raw contents of *data_file*.

    :return: (features_category_tuples, texts)
    """
    # read and split the raw data into positive/negative review texts
    pos_texts, neg_texts = data_helper.get_reviews(
        data_helper.read_file(data_file))

    labeled = {"positive": pos_texts, "negative": neg_texts}

    # extract the requested feature set
    tuples, texts = get_features_category_tuples(labeled, feat_name)

    # persist when an output name was given
    if save_feats is not None:
        write_features_category(tuples, save_feats)

    return tuples, texts
def main(reviews, output):
    """Classify every review file under *reviews* with a word-embedding SVM,
    writing one "<label> <text>" line per review to *output*.

    :param reviews: root directory whose subdirectories hold review files
    :param output: path of the prediction file to write
    """
    model = build_classifier("svm")
    model = train_word_embem_model(model)

    # sanity-check the trained model on the dev split before predicting
    dev_data = "dev_examples.tsv"
    dev_feats, dev_text = get_we_feat(dev_data)
    acc, cm = evaluate(model, dev_feats, dev_text)

    # BUG FIX: the original iterated os.listdir(path) with `path` undefined
    # (NameError at runtime); the unused `reviews` parameter is clearly the
    # intended root directory. The output file is now also closed reliably.
    with open(output, "w+") as out_file:
        for d in os.listdir(reviews):
            for f in os.listdir(os.path.join(reviews, d)):
                texts = data_helper.get_reviews(os.path.join(reviews, d, f))
                for text in texts:
                    w2v_feat = features.get_word_embedding_features(text)
                    out_file.write(model.classify(w2v_feat) + " " + text + "\n")
Example #11
0
def build_features(data_file, feat_name, binning, save_feats=None, test=False):
    """Build features for *data_file*; categories are "unknown"/"unknown2"
    when *test* is true, otherwise "positive"/"negative".

    :return: (features_category_tuples, texts)
    """
    pos_texts, neg_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))

    if test:
        labeled = {"unknown": pos_texts, "unknown2": neg_texts}
    else:
        labeled = {"positive": pos_texts, "negative": neg_texts}

    tuples, texts = get_features_category_tuples(labeled, feat_name, binning)

    # persist when an output name was given
    if save_feats is not None:
        write_features_category(tuples, save_feats, test)

    return tuples, texts
def predict(feat_set, eval_data, train_data, out):
    """Train a model on *train_data* and write one "<label> <text>" line per
    review of *eval_data* to the file *out*.

    :param feat_set: one of the word/pos/liwc feature-set names
    """
    model = train_model(train_data, feat_set)
    reviews = data_helper.get_reviews(os.path.join(DATA_DIR, eval_data))
    fout = open(out, "w+")
    for review in reviews:
        words, tags = features.get_words_tags(review)
        vec = {}
        # the feature sets are cumulative: words, then +POS, then +LIWC
        if feat_set in ("word_features", "word_pos_features",
                        "word_pos_liwc_features"):
            vec.update(features.get_ngram_features(words))
        if feat_set in ("word_pos_features", "word_pos_liwc_features"):
            vec.update(features.get_pos_features(tags))
        if feat_set == "word_pos_liwc_features":
            vec.update(features.get_liwc_features(words))
        fout.write(model.classify(vec) + " " + review + "\n")

    return
Example #13
0
def build_features(data_file, feat_name):
    """Build (features, category) tuples for the labeled reviews in *data_file*.

    :param data_file: file name under DATA_DIR holding pos/neg reviews
    :param feat_name: feature set to extract
    :return: (features_category_tuples, texts)
    """
    # read text data
    positive_texts, negative_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))

    category_texts = {"positive": positive_texts, "negative": negative_texts}

    # build features (commented-out debug scaffolding removed)
    features_category_tuples, texts = get_features_category_tuples(
        category_texts, feat_name)

    return features_category_tuples, texts
Example #14
0
def build_features(data_file, feat_name):
    """Build features for *data_file* and save them under a name derived
    from the feature set and the data split.

    :return: (features_category_tuples, texts)
    """
    pos_texts, neg_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))

    labeled = {"positive": pos_texts, "negative": neg_texts}

    tuples, texts = get_features_category_tuples(labeled, feat_name)

    # map each known input file to the split label used in the output name
    split_names = {
        "dev_examples.tsv": "development",
        "train_examples.tsv": "training",
    }
    out_name = str(feat_name) + "-" + split_names[str(data_file)]
    write_features_category(tuples, out_name)

    return tuples, texts
def train_eval(train_file, eval_file, review_file, feature_set, pred_file):
    """Train a classifier on *train_file*, evaluate it on *eval_file*, then
    either print per-review predictions (test data) or a second evaluation
    for *review_file* — with all console output appended to *pred_file*.

    NOTE(review): `classifier_fname` is not defined in this view — presumably
    a module-level constant; confirm before relying on the model-is-None
    fallback. The file assigned to sys.stdout is never explicitly closed.
    """

    # train the model
    split_name = "train"
    model = train_model(train_file, feature_set, split_name)

    # evaluate the model
    # fall back to a previously saved classifier when training returned None
    if model is None:
        model = get_classifier(classifier_fname)

    # redirect stdout so every print below lands in the prediction file
    copy = sys.stdout
    sys.stdout = open(pred_file, 'a')
    print("Using " + feature_set)
    print(eval_file)
    features_data, texts = build_features(eval_file, feature_set)
    accuracy, cm = evaluate(model,
                            features_data,
                            texts,
                            data_set_name="eval-{}".format(feature_set))
    print("\nThe accuracy of {} is: {}".format(eval_file, accuracy))
    #print("Confusion Matrix:")
    #print(str(cm))

    # test data is unlabeled: emit one predicted label per review instead of
    # computing an accuracy score
    if "test" in review_file:
        texts = data_helper.get_reviews(os.path.join(DATA_DIR, review_file))
        for text in texts:
            words, tags = features.get_words_tags(text)
            feature_vectors = {}
            feature_vectors.update(features.get_ngram_features(words))
            print(model.classify(feature_vectors) + " " + text + "\n")
    else:
        features_data, texts = build_features(review_file, feature_set)
        accuracy, cm = evaluate(model,
                                features_data,
                                texts,
                                data_set_name="eval-{}".format(feature_set))
        print(review_file)
        print("\nThe accuracy of {} is: {}".format(eval_file, accuracy))
        #print("Confusion Matrix:")
        #print(str(cm))
    # restore the real stdout
    sys.stdout = copy
Example #16
0
def features_stub():
    """Write a best-features file for every (dataset, feature set) pair of
    the imdb corpus."""
    all_feature_sets = [
        "word_features", "word_pos_features", "word_pos_liwc_features",
        "word_pos_opinion_features"
    ]
    splits = ["training", "development", "testing"]

    for split in splits:
        for fs in all_feature_sets:
            raw = data_helper.read_file("data/imdb-" + split + ".data")
            pos_texts, neg_texts = data_helper.get_reviews(raw)
            labeled = {
                "positive": pos_texts,
                "negative": neg_texts
            }
            tuples, _texts = get_features_category_tuples(labeled, fs)
            out_name = "best_features/" + fs + "-" + split + "-features.txt"
            write_features_category(tuples, out_name)
def features_stub():
    """Generate a feature file per imdb split and feature set; the output
    name embeds the split tag parsed from the data file name."""
    split_files = [
        "imdb-training.data", "imdb-testing.data", "imdb-development.data"
    ]
    all_feature_sets = [
        "word_pos_features", "word_features", "word_pos_liwc_features",
        "word_pos_opinion_features"
    ]
    for datafile in split_files:
        for fs in all_feature_sets:
            pos_texts, neg_texts = data_helper.get_reviews(
                data_helper.read_file(datafile))

            labeled = {
                "positive": pos_texts,
                "negative": neg_texts
            }

            tuples, _texts = get_features_category_tuples(labeled, fs)
            # e.g. "-training" pulled straight out of the file name
            split_tag = re.search(r'-[a-z]+', datafile).group()
            write_features_category(tuples, f'{fs}{split_tag}-features.txt')
def write_features():
    """Dump feature keys per (data split, feature set) combination.

    NOTE(review): this looks unfinished — only positive reviews are
    processed, `temp_vec` is never filled, and the LIWC branch computes
    features only to discard them. Only the clear defects are fixed here.
    """
    feat_sets = [
        "word_features", "word_pos_features", "word_pos_liwc_features"
    ]
    data_sets = ["-training", "-development", "-testing"]
    end = "-features.txt"
    for data in data_sets:
        for feat in feat_sets:
            if data == "-training":
                file = "train_examples.tsv"
            if data == "-development":
                file = "dev_examples.tsv"
            if data == "-testing":
                file = "test.txt"
            # BUG FIX: the original leaked one open file handle per loop
            # iteration; `with` closes each output file deterministically.
            with open(feat + data + end, "w+") as fout:
                positive_texts, negative_texts = data_helper.get_reviews(
                    os.path.join(DATA_DIR, file))
                for text in positive_texts:
                    words, tags = features.get_words_tags(text)
                    fout.write("positive ")
                    temp_vec = {}
                    # BUG FIX: `.keys` without parentheses printed the bound
                    # method object, not the actual keys
                    if feat == "word_features":
                        print(features.get_ngram_features(words).keys())
                    elif feat == "word_pos_features":
                        print(features.get_ngram_features(words).keys())
                        print(features.get_pos_features(tags).keys())
                    elif feat == "word_pos_liwc_features":
                        features.get_ngram_features(words)
                        features.get_pos_features(tags)
                        features.get_liwc_features(words)

    return
def build_w2vec(data_file):
    """Return word2vec features for the reviews stored in *data_file*.

    :param data_file: file path relative to the working directory
    """
    reviews = data_helper.get_reviews(os.path.join("./", data_file))
    return get_w2v(reviews)
def features_stub(filename):
    """Tokenize the reviews in *filename*, build word/POS ngram and LIWC
    features per polarity, and write everything to disk.

    :param filename: raw data file readable by data_helper.read_file
    """
    raw_data = data_helper.read_file(filename)
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    print('begin tokenize')
    # word tokens first, then POS tokens, in the same document order
    pos_words = [tok for doc in positive_texts for tok in get_words(doc)]
    neg_words = [tok for doc in negative_texts for tok in get_words(doc)]
    pos_tags = [tok for doc in positive_texts for tok in get_pos(doc)]
    neg_tags = [tok for doc in negative_texts for tok in get_pos(doc)]
    print('tokenizing compl')

    # ngrams over word tokens for each polarity
    print('begin word ngram')
    pos_word_ngrams = dict(get_ngram_features(pos_words))
    print('all positive word ngram completed')
    print('begin negative word ngram')
    neg_word_ngrams = dict(get_ngram_features(neg_words))
    print('all negative word ngram completed')
    print('end word ngram')

    # ngrams over POS tokens for each polarity
    print('begin pos ngram')
    pos_tag_ngrams = dict(get_ngram_features(pos_tags))
    print('all pos pos ngram completed')
    print('begin negative ngram')
    neg_tag_ngrams = dict(get_ngram_features(neg_tags))
    print('all negative pos ngram completed')
    print('end pos ngram')

    # LIWC features come from the word tokens
    print('begin liwc')
    pos_liwc = get_liwc_features(pos_words)
    neg_liwc = get_liwc_features(neg_words)
    print('end liwc')

    print('begin file write')
    print(pos_liwc)
    print(neg_liwc)
    fwrite_feature_vectors(filename, pos_word_ngrams, neg_word_ngrams,
                           pos_tag_ngrams, neg_tag_ngrams, pos_liwc,
                           neg_liwc)
Example #21
0
    return features_category_tuples, all_texts


def write_features_category(features_category_tuples, outfile_name):
    """
    Write one "<category>\\t<features>" line per tuple to *outfile_name*.

    :param features_category_tuples: iterable of (features, category) pairs
    :param outfile_name: path of the UTF-8 output file
    :return: None
    """
    with open(outfile_name, "w", encoding="utf-8") as out:
        lines = ["{0:<10s}\t{1}\n".format(category, feats)
                 for feats, category in features_category_tuples]
        out.writelines(lines)


if __name__ == "__main__":
    # Ad-hoc driver: prints the word features of every review in the test
    # file. The commented-out lines below are leftover debugging experiments.
    #file = open("./data/test.txt")
    #text = "this is not a love hate love hotdog. not a hotdog. i love sandwiches wing wong"
    #for text in file:
    #print(get_liwc_features(nltk.word_tokenize("this is not a love hate love hotdog. not a hotdog. i love sandwiches wing wong")))
    #words, tags = get_words_tags("this is not a love hate love hotdog. not a hotdog. i love sandwiches wing wong")
    #print(words)
    #print(get_word_pos_features(text))
    # NOTE(review): `out` is opened but never written to or closed here —
    # presumably the prints were meant to be redirected into it (see the
    # commented-out sys.stdout line).
    out = open("restaurant-competition-model-P1-predictions.txt", "w")
    #sys.stdout = out
    for review in data_helper.get_reviews("./data/test.txt"):
        print(get_word_features(review))
    # restore the real stdout in case a redirect above was re-enabled
    sys.stdout = sys.__stdout__
    pass