Python read_data Examples

Programming Language: Python

Namespace/Package Name: steamroller.tools.io

Method/Function: read_data

Examples at hotexamples.com: 2

Python read_data - 2 examples found. These are the top-rated real-world Python examples of steamroller.tools.io.read_data, extracted from open source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: cnn.py Project: hltcoe/steamroller

def instances_and_lookups(input_file,
                          index_file,
                          sym_lookup=None,
                          label_lookup=None,
                          tag_type="attribute"):
    """
    Read communications and create integer encodings for them, along with lookups to recover the
    strings.  "unk" is mapped to 0 for both symbols and labels, to handle OOV at test time.  If
    symbol or label lookups are passed to the function, does *not* update the lookups and encodes
    unseen items as "unk".

    In other words, when reading training data, don't pass lookups.  When reading test data, pass in
    the lookups from the training data.

    Args:
        input_file: Forwarded to ``read_data`` as its first positional argument.
        index_file: Forwarded to ``read_data`` as its second positional argument.
        sym_lookup: Optional dict mapping symbol -> integer id; must map "unk" to 0.  When
            omitted, or when it contains only the "unk" entry, it is grown in place from the
            data.  Otherwise it is treated as frozen.
        label_lookup: Optional dict mapping label -> integer id, with the same rules as
            ``sym_lookup``.
        tag_type: Forwarded to ``read_data`` as the ``tag_type`` keyword.

    Returns:
        A tuple ``(instances, cid_lookup, sym_lookup, label_lookup)`` where ``instances`` is a
        list of ``(label_id, symbol_ids, cid_id)`` tuples, one per record yielded by
        ``read_data``, and ``cid_lookup`` maps each communication id to a dense integer id.
    """
    # Build fresh dicts per call.  Dict literals in the signature would be shared between
    # calls, so the first "training" call would grow them and every later default call would
    # silently behave as if frozen lookups had been passed.
    if sym_lookup is None:
        sym_lookup = {"unk": 0}
    if label_lookup is None:
        label_lookup = {"unk": 0}
    assert (sym_lookup["unk"] == 0 and label_lookup["unk"] == 0)

    # A lookup holding only "unk" is considered untrained and may be extended.
    update_sym = len(sym_lookup) == 1
    update_label = len(label_lookup) == 1

    cid_lookup = {}
    instances = []
    unk_sym_occs, unk_sym_types = 0, set()
    unk_label_occs, unk_label_types = 0, set()

    # Use the input_file argument (not a module-level `options` object) so the function
    # honours what the caller passed.
    for cid, label, text in read_data(input_file,
                                      index_file,
                                      tag_type=tag_type):
        if update_label:
            label_lookup.setdefault(label, len(label_lookup))
        # Look the cid up in cid_lookup itself so a repeated cid keeps its first id.
        cid_id = cid_lookup.setdefault(cid, len(cid_lookup))

        syms = []
        for c in text:
            if update_sym:
                sym_lookup.setdefault(c, len(sym_lookup))
            sym_id = sym_lookup.get(c, 0)
            syms.append(sym_id)
            if sym_id == 0:
                unk_sym_types.add(c)
                unk_sym_occs += 1

        label_id = label_lookup.get(label, 0)
        instances.append((label_id, syms, cid_id))
        if label_id == 0:
            # Unknown labels are tracked separately from unknown symbols.
            unk_label_types.add(label)
            unk_label_occs += 1

    logging.info("Loaded %d instances, %d labels", len(instances),
                 len(label_lookup))
    logging.info(
        "%d/%d unknown symbol occurrences/types, %d/%d unknown label occurences/types",
        unk_sym_occs,
        len(unk_sym_types),
        unk_label_occs,
        len(unk_label_types),
    )
    return instances, cid_lookup, sym_lookup, label_lookup

Example #2

Show file

File: scikit_learn.py Project: hltcoe/steamroller

    parser.add_argument("--train", dest="train")
    parser.add_argument("--test", dest="test")
    parser.add_argument("--model", dest="model")
    parser.add_argument("--output", dest="output")
    parser.add_argument("--max_ngram", dest="max_ngram", type=int, default=4)
    parser.add_argument("--batch_size",
                        dest="batch_size",
                        type=int,
                        default=None)
    options = parser.parse_args()

    # training
    if options.train and options.output and options.input:
        instances, labels = [], []
        for cid, label, text in read_data(options.input,
                                          options.train,
                                          tag_type=options.tag_type):
            instances.append(
                dict(
                    sum([
                        extract_character_ngrams(text, n)
                        for n in range(1, options.max_ngram + 1)
                    ], [])))
            labels.append(label)
        dv = DictVectorizer(sparse=True)
        X = dv.fit_transform(instances)
        fs = SelectKBest(k=options.kbest if options.kbest > 0 else X.shape[1])
        X = fs.fit_transform(X, labels)
        print(X.shape)
        label_lookup = {}
        classifier_class, args, hypers = models[options.model_type]