Beispiel #1
0
def load_sequences(filename, do_stem=False):
    """
    Read whitespace-separated sequences from a file and encode them as integers.

    Every input line is one sequence. Sequences that contain no observables
    (blank lines, or lines consisting only of stop words when stemming) are
    left out of the coded output.

    Args:
        filename: Input file(s), passed straight to `fileinput.input`.
        do_stem: When true, stop words are dropped and each remaining word
            is replaced by `stem(word)` before codes are assigned.

    Returns:
        A tuple `(words, word_codes, coded_seqs)`:
        words -- sorted list of the distinct raw tokens; never stemmed.
        word_codes -- dict mapping each observable (stemmed if `do_stem`)
            to its integer code; codes follow sorted order.
        coded_seqs -- list of numpy arrays, one per non-empty sequence,
            holding the codes of that sequence's observables.
    """
    try:
        split_lines = [line.split() for line in fileinput.input(filename)]
    finally:
        # Release the file handle and fileinput's module-level state even
        # when reading fails part-way through.
        fileinput.close()

    # Sorted rather than raw set order: set iteration order over strings
    # changes between interpreter runs (hash randomisation), which would
    # make the integer codes irreproducible.
    words = sorted({w for seq in split_lines for w in seq})

    if do_stem:
        seqs = [[stem(w) for w in seq if not isStopWord(w)]
                for seq in split_lines]
        # Build the vocabulary from the already-stemmed sequences so that
        # each word is stemmed only once.
        stemmed_words = sorted({w for seq in seqs for w in seq})
    else:
        seqs = split_lines
        stemmed_words = words

    word_codes = {w: i for i, w in enumerate(stemmed_words)}
    coded_seqs = [np.array([word_codes[w] for w in seq])
                  for seq in seqs if seq]
    return words, word_codes, coded_seqs
Beispiel #2
0
def output_results(prefix, states, stemmed, iterations, seed, words, word_codes, hmm, i):
    """
    Dump the emission probabilities of `hmm` for every word to a text file.

    The file name is built from `prefix`, the run parameters (`states`,
    `stemmed`, `iterations`, `seed`) and the index `i`. Each output line is
    a word followed by the values of `hmm.emit_probs[:, code]` for that
    word's code, separated by single spaces. When `stemmed` is true, stop
    words are skipped and the code is looked up under `stem(word)`.

    Returns the name of the file that was written.
    """
    name_template = '{}_states-{}_stemmed-{}_iters-{}_seed-{}_{}.txt'
    filename = name_template.format(prefix, states, stemmed, iterations, seed, i)
    with open(filename, 'w') as out:
        for word in words:
            # Stop words have no code when the model was trained on stems.
            if stemmed and isStopWord(word):
                continue
            key = stem(word) if stemmed else word
            column = word_codes[key]
            probs = ' '.join(str(p) for p in hmm.emit_probs[:, column])
            out.write(word + ' ' + probs + '\n')
    return filename
Beispiel #3
0
            help='Run HMM on stemmed words')
    parser.add_argument('-i', default=10000, type=int,
            help='Maximum number of iterations of EM to do')
    parser.add_argument('--seed', default=False, action='store_true',
            help='Emission probabilities are seeded with a modded co-occurrence matrix')
    parser.add_argument('--out', default=None, type=str, metavar='file prefix',
            help='Output intermittent data in files prefixed with this argument. By default, final results are printed to stdout.')

    args = parser.parse_args()

    log('Reading sequences')
    words, word_codes, coded_seqs = load_sequences(args.f, args.s)

    log('{} words, {} sequences, {} observables'.format(len(words), len(coded_seqs), len(word_codes)))
    log('Generating initial HMM')
    if args.seed:
        emit_probs = make_modded_cooccurrence(args.n, len(word_codes), coded_seqs)
        init_hmm = random_hmm(args.n, len(words), emit_probs = emit_probs)
    else:
        init_hmm = random_hmm(args.n, len(words))
    log('Running EM')
    out_func = lambda hmm, i : output_results(args.out, args.n, args.s, args.i, args.seed, words, word_codes, hmm, i) if args.out else None
    final_hmm = maximize_expectation(init_hmm, coded_seqs, max_iters = args.i, print_nll = True, out_func = out_func)
    if not args.out:
        log('Writing results')
        for w in words:
            if not args.s or not isStopWord(w):
                i = word_codes[stem(w)] if args.s else word_codes[w]
                print(w + ' ' + ' '.join(str(x) for x in final_hmm.emit_probs[:, i]))