Example #1
0
def instances_and_lookups(input_file,
                          index_file,
                          sym_lookup=None,
                          label_lookup=None,
                          tag_type="attribute"):
    """
    Read communications and create integer encodings for them, along with lookups to recover the
    strings.  "unk" is mapped to 0 for both symbols and labels, to handle OOV at test time.  If
    symbol or label lookups are passed to the function, does *not* update the lookups and encodes
    unseen items as "unk".

    In other words, when reading training data, don't pass lookups.  When reading test data, pass in
    the lookups from the training data.

    Args:
        input_file: Forwarded to ``read_data`` as its first positional argument.
        index_file: Forwarded to ``read_data`` as its second positional argument.
        sym_lookup: Mapping from symbol to integer id; must map "unk" to 0.  When omitted (or
            when it contains only the "unk" entry) a lookup is built from the data; a passed-in
            dict is extended in place in that case.  Otherwise it is used read-only.
        label_lookup: Mapping from label to integer id, with the same rules as ``sym_lookup``.
        tag_type: Forwarded to ``read_data`` as its ``tag_type`` keyword argument.

    Returns:
        A tuple ``(instances, cid_lookup, sym_lookup, label_lookup)`` where ``instances`` is a
        list of ``(label_id, symbol_ids, cid_id)`` tuples and ``cid_lookup`` maps each ``cid``
        yielded by ``read_data`` to a dense integer id assigned in order of first appearance.
    """
    # Build fresh lookups per call.  Dict literals as defaults would be shared between calls, so
    # a first (training) call would fill them and a later default call would stop updating.
    if sym_lookup is None:
        sym_lookup = {"unk": 0}
    if label_lookup is None:
        label_lookup = {"unk": 0}
    assert (sym_lookup["unk"] == 0 and label_lookup["unk"] == 0)

    # A lookup holding only "unk" is treated as "not yet trained", i.e. it may grow.
    update_sym = len(sym_lookup) == 1
    update_label = len(label_lookup) == 1

    cid_lookup = {}
    instances = []
    unk_sym_occs, unk_sym_types = 0, set()
    unk_label_occs, unk_label_types = 0, set()

    for cid, label, text in read_data(input_file,
                                      index_file,
                                      tag_type=tag_type):
        if update_label:
            label_lookup.setdefault(label, len(label_lookup))
        # Keep the id stable when the same cid shows up more than once.
        cid_id = cid_lookup.setdefault(cid, len(cid_lookup))

        syms = []
        for c in text:
            if update_sym:
                sym_lookup.setdefault(c, len(sym_lookup))
            sym_id = sym_lookup.get(c, 0)
            syms.append(sym_id)
            if sym_id == 0:
                unk_sym_types.add(c)
                unk_sym_occs += 1

        label_id = label_lookup.get(label, 0)
        instances.append((label_id, syms, cid_id))
        if label_id == 0:
            # Track unknown labels separately from unknown symbols so both counts are right.
            unk_label_types.add(label)
            unk_label_occs += 1

    logging.info("Loaded %d instances, %d labels", len(instances),
                 len(label_lookup))
    logging.info(
        "%d/%d unknown symbol occurrences/types, %d/%d unknown label occurences/types",
        unk_sym_occs,
        len(unk_sym_types),
        unk_label_occs,
        len(unk_label_types),
    )
    return instances, cid_lookup, sym_lookup, label_lookup
Example #2
0
    parser.add_argument("--train", dest="train")
    parser.add_argument("--test", dest="test")
    parser.add_argument("--model", dest="model")
    parser.add_argument("--output", dest="output")
    parser.add_argument("--max_ngram", dest="max_ngram", type=int, default=4)
    parser.add_argument("--batch_size",
                        dest="batch_size",
                        type=int,
                        default=None)
    options = parser.parse_args()

    # training
    if options.train and options.output and options.input:
        instances, labels = [], []
        for cid, label, text in read_data(options.input,
                                          options.train,
                                          tag_type=options.tag_type):
            instances.append(
                dict(
                    sum([
                        extract_character_ngrams(text, n)
                        for n in range(1, options.max_ngram + 1)
                    ], [])))
            labels.append(label)
        dv = DictVectorizer(sparse=True)
        X = dv.fit_transform(instances)
        fs = SelectKBest(k=options.kbest if options.kbest > 0 else X.shape[1])
        X = fs.fit_transform(X, labels)
        print(X.shape)
        label_lookup = {}
        classifier_class, args, hypers = models[options.model_type]