Example #1
import data_helper
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression


def test_baseline(src, tar, C=1.0):
    # C: inverse regularization strength for LogisticRegression
    # (a module-level constant in the original; exposed as a parameter here)
    prefix = '../dataset/Sentiment/'
    suffix = ['/positive.review', '/negative.review', '/unlabeled.review']
    src_pos, src_pos_label = data_helper.read_file(prefix + src + suffix[0])
    src_neg, src_neg_label = data_helper.read_file(prefix + src + suffix[1])
    tar_pos, tar_pos_label = data_helper.read_file(prefix + tar + suffix[0])
    tar_neg, tar_neg_label = data_helper.read_file(prefix + tar + suffix[1])

    transformer = TfidfTransformer()
    cv_src = CountVectorizer(min_df=20)
    # x_src = cv_src.fit_transform(src_pos + src_neg).toarray()
    x_src = transformer.fit_transform(cv_src.fit_transform(src_pos +
                                                           src_neg)).toarray()

    y_src = src_pos_label + src_neg_label

    lr = LogisticRegression(solver='lbfgs', C=C)
    lr.fit(x_src, y_src)

    # x_tar = cv_src.transform(tar_pos + tar_neg).toarray()
    # Use transform (not fit_transform) so the target domain reuses the
    # IDF statistics fitted on the source domain.
    x_tar = transformer.transform(cv_src.transform(tar_pos +
                                                   tar_neg)).toarray()
    y_tar = tar_pos_label + tar_neg_label

    acc = lr.score(x_tar, y_tar)
    return acc
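A minimal usage sketch; the domain names below are illustrative assumptions (the snippet only requires that matching directories exist under ../dataset/Sentiment/).

# Hypothetical domains; any two domain folders in the dataset would do.
acc = test_baseline('books', 'dvd')
print('baseline cross-domain accuracy: %.4f' % acc)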
Example #2
def _readData(self, filePath):
    """
    Read a review file and clean each review.

    :param filePath: path to the review data file
    :return: (labels, contents), where contents are the cleaned review strings
    """
    labels, s = data_helper.read_file(filePath)
    contents = []
    for i in range(len(labels)):
        content = self.cleanReview(s[i])
        contents.append(content)
    return labels, contents
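A hedged usage sketch: _readData is written as a method, so it assumes an enclosing class that provides cleanReview. The class below, its cleaning step, and the file name are illustrative assumptions, not part of the original.

# Hypothetical enclosing class; the real cleanReview may differ.
class ReviewDataset:
    def cleanReview(self, review):
        # placeholder cleaning: trim whitespace and lowercase
        return review.strip().lower()

ReviewDataset._readData = _readData  # attach the method defined above
labels, contents = ReviewDataset()._readData('unlabeled.review')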
Example #3
def features_stub():
    datafile = "restaurant-training.data"

    raw_data = data_helper.read_file(os.path.join(DATA_DIR, datafile))
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    category_texts = {"positive": positive_texts, "negative": negative_texts}
    feature_set = "word_features"

    features_category_tuples, texts = get_features_category_tuples(
        category_texts, feature_set)

    # TODO: choose an output filename below, then remove this raise.
    raise NotImplementedError
    filename = "???"
    write_features_category(features_category_tuples, filename)
Example #4
def features_stub():
    datafile = "imdb-training.data"
    raw_data = data_helper.read_file(datafile)
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    category_texts = {"positive": positive_texts, "negative": negative_texts}
    # FEATURE_SETS = {"word_pos_features", "word_features", "word_pos_liwc_features", "word_pos_opinion_features"}

    feature_set = "word_pos_opinion_features"

    features_category_tuples, texts = get_features_category_tuples(
        category_texts, feature_set)

    # raise NotImplemented
    filename = feature_set + "-training-features.txt"  # match the input split
    write_features_category(features_category_tuples, filename)
Example #5
def build_features(data_file, feat_name, save_feats=None, binning=False):
    # note: the `binning` flag is accepted but currently unused
    # read text data
    raw_data = data_helper.read_file(data_file)
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    category_texts = {"positive": positive_texts, "negative": negative_texts}

    # build features
    features_category_tuples, texts = get_features_category_tuples(
        category_texts, feat_name)

    # save features to file
    if save_feats is not None:
        write_features_category(features_category_tuples, save_feats)

    return features_category_tuples, texts
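A short usage sketch; the file and feature-set names below follow the conventions of the other examples in this listing and are assumptions, not part of this snippet.

# Hypothetical call: build and save word features for the IMDB training split.
feats, texts = build_features("imdb-training.data", "word_features",
                              save_feats="word_features-training-features.txt")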
Example #6
import pickle

import numpy as np

import data_helper
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


def test(src, tar, pivot_num, pivot_min_times, dim, C):
    weight_path = './weight/' + src + '2' + tar + '_svd_' + str(dim) + '.npy'
    W = np.load(weight_path)

    pivot_path = './pivot/' + src + '2' + tar + '_pivot_' + str(
        pivot_num) + '_' + str(pivot_min_times)
    with open(pivot_path, 'rb') as f:
        pivot = pickle.load(f)

    prefix = '../dataset/Sentiment/'
    suffix = ['/positive.review', '/negative.review', '/unlabeled.review']
    src_pos, src_pos_label = data_helper.read_file(prefix + src + suffix[0])
    src_neg, src_neg_label = data_helper.read_file(prefix + src + suffix[1])
    src_unl, src_unl_label = data_helper.read_file(prefix + src + suffix[2])
    tar_pos, tar_pos_label = data_helper.read_file(prefix + tar + suffix[0])
    tar_neg, tar_neg_label = data_helper.read_file(prefix + tar + suffix[1])
    tar_unl, tar_unl_label = data_helper.read_file(prefix + tar + suffix[2])

    # transformer_src = TfidfTransformer()
    cv_src = CountVectorizer(min_df=20)
    x_src = cv_src.fit_transform(src_pos + src_neg).toarray()
    # x_src = transformer_src.fit_transform(cv_src.fit_transform(src_pos + src_neg)).toarray()

    cv_lab_unl = CountVectorizer(min_df=40)
    x_lab_unl = cv_lab_unl.fit_transform(src_unl + src_pos + src_neg +
                                         tar_unl).toarray()
    # transformer_lab_unl = TfidfTransformer()
    # x_lab_unl = transformer_lab_unl.fit_transform(cv_lab_unl.fit_transform(src_unl + src_pos + src_neg + tar_unl)).toarray()

    # x_src_transform = transformer_lab_unl.transform(cv_lab_unl.transform(src_pos + src_neg)).toarray()
    # Drop the pivot columns, then project the remaining counts with the
    # learned weight matrix W to get the shared low-dimensional features.
    x_src_transform = cv_lab_unl.transform(src_pos + src_neg).toarray()
    x_src_transform = np.delete(x_src_transform, pivot, 1)
    x_src_transform = x_src_transform.dot(W)

    x_src = np.concatenate((x_src, x_src_transform), axis=1)
    y_src = src_pos_label + src_neg_label

    lr = LogisticRegression(solver='lbfgs', C=C)
    lr.fit(x_src, y_src)

    # x_tar_transform = transformer_lab_unl.transform(cv_lab_unl.transform(tar_pos + tar_neg)).toarray()
    x_tar_transform = cv_lab_unl.transform(tar_pos + tar_neg).toarray()
    x_tar_transform = np.delete(x_tar_transform, pivot, 1)
    x_tar_transform = x_tar_transform.dot(W)

    # x_tar = transformer_src.transform(cv_src.transform(tar_pos + tar_neg)).toarray()
    x_tar = cv_src.transform(tar_pos + tar_neg).toarray()
    x_tar = np.concatenate((x_tar, x_tar_transform), axis=1)
    y_tar = tar_pos_label + tar_neg_label

    acc = lr.score(x_tar, y_tar)
    return acc
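A hedged usage sketch for this pivot-based variant; every hyperparameter value below is an illustrative assumption, and the corresponding files must already exist under ./weight/ and ./pivot/.

# Hypothetical call; weight/pivot files for this configuration must exist.
acc = test('books', 'dvd', pivot_num=100, pivot_min_times=10, dim=50, C=1.0)
print('pivot-augmented cross-domain accuracy: %.4f' % acc)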
Example #7
def features_stub():

    feature_sets = [
        "word_features", "word_pos_features", "word_pos_liwc_features",
        "word_pos_opinion_features"
    ]
    datasets = ["training", "development", "testing"]

    for dataset in datasets:
        for feature_set in feature_sets:
            datafile = "data/imdb-" + dataset + ".data"
            raw_data = data_helper.read_file(datafile)
            positive_texts, negative_texts = data_helper.get_reviews(raw_data)
            category_texts = {
                "positive": positive_texts,
                "negative": negative_texts
            }
            features_category_tuples, texts = get_features_category_tuples(
                category_texts, feature_set)
            filename = "best_features/" + feature_set + "-" + dataset + "-features.txt"
            write_features_category(features_category_tuples, filename)
Example #8
def features_stub():
    datafiles = [
        "imdb-training.data", "imdb-testing.data", "imdb-development.data"
    ]
    featuresets = [
        "word_pos_features", "word_features", "word_pos_liwc_features",
        "word_pos_opinion_features"
    ]
    for datafile in datafiles:
        for feature_set in featuresets:
            raw_data = data_helper.read_file(datafile)
            positive_texts, negative_texts = data_helper.get_reviews(raw_data)

            category_texts = {
                "positive": positive_texts,
                "negative": negative_texts
            }

            features_category_tuples, texts = get_features_category_tuples(
                category_texts, feature_set)
            # extract the split name (e.g. "-training") from the file name
            data_set = re.search(r'-[a-z]+', datafile).group()
            filename = f'{feature_set}{data_set}-features.txt'
            write_features_category(features_category_tuples, filename)
Example #9
def features_stub(filename):
    # Read the data file (e.g. restaurant-training.data) and let
    # data_helper split the reviews into positive and negative lists.
    # Other data files could be handled the same way.
    datafile = filename
    raw_data = data_helper.read_file(datafile)
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    # category_texts would map each label to its reviews:
    #   {"positive": [...all positive reviews...],
    #    "negative": [...all negative reviews...]}
    # category_texts = {"positive": positive_texts, "negative": negative_texts}
    # feature_set = "word_features"
    positive_toks = []
    positive_pos_toks = []
    negative_toks = []
    negative_pos_toks = []
    print('begin tokenize')
    # Collect word and POS tokens. Separate loops are not the most
    # efficient approach, but they are easier to trace.
    for documents in positive_texts:
        positive_toks += get_words(documents)
    for documents in negative_texts:
        negative_toks += get_words(documents)

    for documents in positive_texts:
        positive_pos_toks += get_pos(documents)
    for documents in negative_texts:
        negative_pos_toks += get_pos(documents)
    print('tokenizing complete')
    # get ngrams for positive and negative categories
    posi_word_ngram = {}
    posi_pos_ngram = {}
    neg_word_ngram = {}
    neg_pos_ngram = {}
    print('begin word ngram')
    #for tokens in positive_toks:
    #    posi_word_ngram.update( get_ngram_features( tokens ) )
    posi_word_ngram.update(get_ngram_features(positive_toks))
    print('all positive word ngram completed')
    print('begin negative word ngram')
    #for tokens in negative_toks:
    #    neg_word_ngram.update( get_ngram_features( tokens ) )
    neg_word_ngram.update(get_ngram_features(negative_toks))
    print('all negative word ngram completed')
    print('end word ngram')

    print('begin POS ngram')
    # for tokens in positive_pos_toks:
    #     posi_pos_ngram.update(get_ngram_features(tokens))
    posi_pos_ngram.update(get_ngram_features(positive_pos_toks))
    print('all positive POS ngrams completed')
    print('begin negative POS ngram')
    # for tokens in negative_pos_toks:
    #     neg_pos_ngram.update(get_ngram_features(tokens))
    neg_pos_ngram.update(get_ngram_features(negative_pos_toks))
    print('all negative POS ngrams completed')
    print('end POS ngram')
    print('begin liwc')
    # get LIWC features
    posi_liwc_feat = get_liwc_features(positive_toks)
    neg_liwc_feat = get_liwc_features(negative_toks)
    print('end liwc')
    print('begin file write')
    print(posi_liwc_feat)
    print(neg_liwc_feat)
    fwrite_feature_vectors(filename, posi_word_ngram, neg_word_ngram,
                           posi_pos_ngram, neg_pos_ngram, posi_liwc_feat,
                           neg_liwc_feat)
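A usage sketch; the file name comes from the comment at the top of this example.

# Builds word/POS n-gram and LIWC features for the restaurant training data.
features_stub("restaurant-training.data")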