Example #1
0
def run(args):
    """Vectorize input sentences and pickle train/test splits to the output.

    Reads newline-delimited JSON from every file in ``args.input``,
    vectorizes the sentences, splits them using the ``train`` section of
    CONFIG, and writes two pickles to ``args.output``: the training pair
    first, then the held-out pair.
    """
    records = chain.from_iterable(read_json_lines(fn) for fn in args.input)
    pairs = list(vectorize_sentences(records))
    features, labels = zip(*pairs)
    train_cfg = CONFIG['train']
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=train_cfg['test_size'],
        random_state=train_cfg['random_state'])
    # Two sequential dumps: readers must unpickle in the same order.
    pickle.dump((X_train, y_train), args.output)
    pickle.dump((X_test, y_test), args.output)
Example #2
0
def run(args):
    """Vectorize input sentences and pickle an 80/20 train/test split.

    Uses a shared ``enumerator`` for vectorization, then writes two
    pickles to ``args.output``: (X_train, y_train) followed by
    (X_test, y_test).
    """
    enum = enumerator()
    docs = chain.from_iterable(read_json_lines(path) for path in args.input)
    vectorized = list(vectorize_sentences(enum, docs))
    X, y = zip(*vectorized)
    # Deterministic 20% hold-out (fixed seed 0).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    pickle.dump((X_train, y_train), args.output)
    pickle.dump((X_test, y_test), args.output)
def do_reducer(args):
    """Reduce NDJSON results into a CSV summary and generate plots."""
    import pandas as pd

    columns = ndjson2col(read_json_lines(args.input))
    frame = pd.DataFrame.from_dict(columns)
    summary_path = os.path.join(args.output, "summary.csv")
    logging.info("Writing brief summary to %s", summary_path)
    frame.to_csv(summary_path)
    create_plots(args, frame)
Example #4
0
def do_reducer(args):
    """Collect NDJSON input into a DataFrame, write summary.csv, and plot."""
    import pandas as pd

    table = pd.DataFrame.from_dict(ndjson2col(read_json_lines(args.input)))
    out_csv = os.path.join(args.output, "summary.csv")
    logging.info("Writing brief summary to %s", out_csv)
    table.to_csv(out_csv)
    create_plots(args, table)
Example #5
0
def sample_by_y(args):
    """Load labeled sentences, optionally down-sampled per Y label.

    Reads NDJSON records from ``args.sentences``. When the ``train``
    config specifies ``sample_labeled`` counts, reservoir-samples that
    many records per Y value for a balanced subset.

    Returns:
        tuple: (sentences, y_labels) where y_labels is a float ndarray.
    """
    records = chain.from_iterable(read_json_lines(path)
                                  for path in args.sentences)
    train_cfg = CONFIG['train']
    per_label = train_cfg.get('sample_labeled')
    if per_label:
        # Deterministic per-label reservoir sampling (seeded by config).
        records = reservoir_dict(records, "Y", per_label,
                                 random_state=train_cfg['random_state'])
    pairs = [(rec['X'], rec['Y']) for rec in records]
    sentences, raw_y = zip(*pairs)
    return sentences, np.array(raw_y, dtype=float)
Example #6
0
def do_reducer(args):
    """Summarize selected metric columns from NDJSON input and plot them.

    Keeps only the group-by, x-axis, trial, and metric columns before
    writing summary.csv and producing plots.
    """
    import pandas as pd

    raw = pd.DataFrame.from_dict(ndjson2col(read_json_lines(args.input)))
    wanted = [args.group_by, args.x_axis, args.trial] + args.metrics
    subset = get_df_subset(raw, wanted)
    target_csv = os.path.join(args.output, "summary.csv")
    logging.info("Writing brief summary to %s", target_csv)
    subset.to_csv(target_csv)
    create_plots(args, subset, args.metrics)
Example #7
0
def run(args):
    """Vectorize all input sentences and pickle an 80/20 train/test split.

    The training pair is pickled to ``args.output`` first, then the
    test pair; consumers must unpickle in the same order.
    """
    enum = enumerator()
    stream = chain.from_iterable(read_json_lines(name) for name in args.input)
    samples = list(vectorize_sentences(enum, stream))
    xs, ys = zip(*samples)
    # Fixed seed keeps the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        xs, ys, test_size=0.2, random_state=0)
    for pair in ((X_train, y_train), (X_test, y_test)):
        pickle.dump(pair, args.output)
Example #8
0
def get_data(args):
    """Assemble the feature matrix and labels for training.

    Builds features according to CONFIG['train']['features']: plain
    bag-of-words, embedding-based (word2vec/doc2vec), or a mix of both.

    Returns:
        tuple: ``(is_mixed, (features, labels))`` where ``is_mixed`` is
        True only when BOW and embedding features are combined.

    Raises:
        RuntimeError: when an embedding is required but ``--embedding``
            was not supplied, or when the configuration names an unknown
            pretraining algorithm or feature set.
    """
    feature_set_names = CONFIG['train']['features']
    if set(feature_set_names).intersection(['word2vec', 'doc2vec'
                                            ]) and not args.embedding:
        raise RuntimeError("--embedding argument must be supplied")

    # get Y labels
    training_set = read_tsv(args.train)
    y_labels = training_set["sentiment"]

    sentences = [obj['review'] for obj in read_json_lines(args.sentences)]

    if not args.embedding or feature_set_names == ['bow']:
        # don't drop NaNs -- have a sparse matrix here
        return False, (get_bow_features(sentences), y_labels)

    # load embedding
    algorithm = CONFIG['pretrain']['algorithm']
    if algorithm == 'word2vec':
        embedding = word2vec.Word2Vec.load(args.embedding)
    elif algorithm == 'glove':
        embedding = Glove.load(args.embedding)
        # dynamically add GloveWrapper mixin
        embedding.__class__ = type('MyGlove', (Glove, GloveWrapper), {})
    else:
        # BUG FIX: previously fell through with `embedding` unbound,
        # producing a confusing NameError below instead of a clear error.
        raise RuntimeError("Invalid config setting pretrain:algorithm=%s" %
                           algorithm)

    # get feature vectors
    if 'doc2vec' in feature_set_names:
        embedding_vectors = get_doc2vec_features(sentences, embedding)
    elif 'word2vec' in feature_set_names:
        embedding_vectors = get_word2vec_features(sentences, embedding)
    else:
        raise RuntimeError("Invalid config setting train:features=%s" %
                           CONFIG['train']['features'])

    if 'bow' in feature_set_names:
        return True, get_mixed_features(sentences, embedding_vectors, y_labels)
    else:
        # matrix is dense -- drop NaNs
        return False, drop_nans(embedding_vectors, y_labels)
Example #9
0
def get_data(args):
    """Assemble the feature matrix and labels for training.

    Builds features according to CONFIG['train']['features']: plain
    bag-of-words, embedding-based (word2vec/doc2vec), or a mix of both.

    Returns:
        tuple: ``(is_mixed, (features, labels))`` where ``is_mixed`` is
        True only when BOW and embedding features are combined.

    Raises:
        RuntimeError: when an embedding is required but ``--embedding``
            was not supplied, or when the configuration names an unknown
            pretraining algorithm or feature set.
    """
    feature_set_names = CONFIG['train']['features']
    if set(feature_set_names).intersection(['word2vec', 'doc2vec']) and not args.embedding:
        raise RuntimeError("--embedding argument must be supplied")

    # get Y labels
    training_set = read_tsv(args.train)
    y_labels = training_set["sentiment"]

    sentences = [obj['review'] for obj in read_json_lines(args.sentences)]

    if not args.embedding or feature_set_names == ['bow']:
        # don't drop NaNs -- have a sparse matrix here
        return False, (get_bow_features(sentences), y_labels)

    # load embedding
    algorithm = CONFIG['pretrain']['algorithm']
    if algorithm == 'word2vec':
        embedding = word2vec.Word2Vec.load(args.embedding)
    elif algorithm == 'glove':
        embedding = Glove.load(args.embedding)
        # dynamically add GloveWrapper mixin
        embedding.__class__ = type('MyGlove', (Glove, GloveWrapper), {})
    else:
        # BUG FIX: previously fell through with `embedding` unbound,
        # producing a confusing NameError below instead of a clear error.
        raise RuntimeError("Invalid config setting pretrain:algorithm=%s" % algorithm)

    # get feature vectors
    if 'doc2vec' in feature_set_names:
        embedding_vectors = get_doc2vec_features(sentences, embedding)
    elif 'word2vec' in feature_set_names:
        embedding_vectors = get_word2vec_features(sentences, embedding)
    else:
        raise RuntimeError("Invalid config setting train:features=%s" % CONFIG['train']['features'])

    if 'bow' in feature_set_names:
        return True, get_mixed_features(sentences, embedding_vectors, y_labels)
    else:
        # matrix is dense -- drop NaNs
        return False, drop_nans(embedding_vectors, y_labels)
Example #10
0
def json_field_iter(files, field=None):
    """Yield documents (or one field of each) from NDJSON files.

    Args:
        files: iterable of file names to read with ``read_json_lines``.
        field: optional key; when given, yield ``doc[field]`` instead of
            the whole document.
    """
    for fname in files:
        docs = read_json_lines(fname)
        if field is None:
            yield from docs
        else:
            for doc in docs:
                yield doc[field]
Example #11
0
def doc_iter(args):
    """Yield ``args.field`` from every NDJSON document in ``args.input``."""
    key = args.field
    for path in args.input:
        for record in read_json_lines(path):
            yield record[key]