#!/usr/bin/env python
"""Assign integer ids to words and POS tags, emitting new id/value rows as TSV."""
import sys
import argparse

import ngram

# Report progress every PROGRESS input records.
PROGRESS = 500000

# word -> id cache, seeded from the existing 'word' DB table.
WORD = ngram.load_table('word')
WORD_AI = max(WORD.values()) if len(WORD) > 0 else 0
print("Loaded %d words. Starting at word id %d"
      % (len(WORD), WORD_AI), file=sys.stderr)


def word_id(word, outfile):
    """Return the integer id for *word*, allocating a new id for unseen words.

    Newly allocated ids are appended to *outfile* as an "id\\tword" TSV row.
    Words are truncated to 45 characters (presumably the DB column width —
    confirm against the 'word' table schema).
    """
    global WORD, WORD_AI
    word = word[:45]
    v = WORD.get(word, None)
    if v is None:
        WORD_AI += 1
        v = WORD_AI
        # Bug fix: cache the freshly allocated id; without this the same
        # unseen word received a new id (and a duplicate row) on every call.
        WORD[word] = v
        # Escape the word so tabs/newlines/quotes can't corrupt the TSV row.
        print('%d\t%s' % (v, ngram.db.escape_string(word)), file=outfile)
    return v


# POS tag -> id cache, seeded from the existing 'pos' DB table.
POS = ngram.load_table('pos')
POS_AI = max(POS.values()) if len(POS) > 0 else 0
print("Loaded %d POS. Starting at pos id %d"
      % (len(POS), POS_AI), file=sys.stderr)

# The universal POS tag set used by the Google Books ngram corpus.
NGRAM_POS = set(['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP',
                 'NUM', 'CONJ', 'PRT', 'X', '.'])


def pos_id(tag, outfile):
    """Validate *tag* against NGRAM_POS.

    NOTE(review): the remainder of this function is truncated in this view;
    only the validation guard is visible here.
    """
    global POS, POS_AI, NGRAM_POS
    if tag not in NGRAM_POS:
        raise ValueError("Not a POS tag")
#!/usr/bin/env python
"""Assign integer ids to words (and other cached values), emitting new rows as TSV."""
import sys
import argparse

import ngram

# Report progress every PROGRESS input records.
PROGRESS = 1000000

# word -> id cache, seeded from the existing 'word' DB table.
WORD = ngram.load_table('word')
WORD_AI = max(WORD.values()) if len(WORD) > 0 else 0
print("Loaded %d words. Starting at word id %d"
      % (len(WORD), WORD_AI), file=sys.stderr)


def word_id(word, outfile):
    """Return the integer id for *word*, allocating a new id for unseen words.

    Newly allocated ids are appended to *outfile* as an "id\\tword" TSV row,
    with the word escaped so it cannot corrupt the row. Words are truncated
    to 45 characters (presumably the DB column width — confirm against the
    'word' table schema).
    """
    global WORD, WORD_AI
    word = word[:45]
    v = WORD.get(word, None)
    if v is None:
        WORD_AI += 1
        v = WORD_AI
        WORD[word] = v
        print('%d\t%s' % (v, ngram.db.escape_string(word)), file=outfile)
    return v


def cached_lookup(key, cache, outfile):
    """Return the id for *key* from *cache*, allocating the next id if unseen.

    New ids are appended to *outfile* as an "id\\tkey" TSV row.
    """
    v = cache.get(key, None)
    if v is None:
        # default=0 guards the first allocation when the cache starts empty
        # (max() on an empty sequence would otherwise raise ValueError).
        v = max(cache.values(), default=0) + 1
        print('%d\t%s' % (v, ngram.db.escape_string(key)), file=outfile)
        cache[key] = v
    # Bug fix: the visible version fell off the end and returned None on
    # cache hits; the complete sibling version of this function returns v.
    return v
#!/usr/bin/env python
"""Assign integer ids to words and POS tags, emitting new id/value rows as TSV."""
import sys
import argparse

import ngram

# Report progress every PROGRESS input records.
PROGRESS = 100000

# word -> id cache, seeded from the existing 'word' DB table.
WORD = ngram.load_table('word')
WORD_AI = max(WORD.values()) if len(WORD) > 0 else 0
print("Loaded %d words. Starting at word id %d"
      % (len(WORD), WORD_AI), file=sys.stderr)


def word_id(word, outfile):
    """Return the integer id for *word*, allocating a new id for unseen words.

    Newly allocated ids are appended to *outfile* as an "id\\tword" TSV row,
    with the word escaped so it cannot corrupt the row. Words are truncated
    to 45 characters (presumably the DB column width — confirm against the
    'word' table schema).
    """
    global WORD, WORD_AI
    word = word[:45]
    v = WORD.get(word, None)
    if v is None:
        WORD_AI += 1
        v = WORD_AI
        WORD[word] = v
        print('%d\t%s' % (v, ngram.db.escape_string(word)), file=outfile)
    return v


# POS tag -> id cache, seeded from the existing 'pos' DB table.
POS = ngram.load_table('pos')
POS_AI = max(POS.values()) if len(POS) > 0 else 0
print("Loaded %d POS. Starting at pos id %d"
      % (len(POS), POS_AI), file=sys.stderr)

# The universal POS tag set used by the Google Books ngram corpus.
# Bug fix: the list literal was left unterminated; completed with the final
# two tags ('X', '.') exactly as in the sibling version of this script.
NGRAM_POS = [
    'NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP', 'NUM', 'CONJ', 'PRT',
    'X', '.',
]
#!/usr/bin/env python
"""Assign integer ids to words (and other cached values), emitting new rows as TSV."""
import sys
import argparse

import ngram

# Report progress every PROGRESS input records.
PROGRESS = 1000000

# word -> id cache, seeded from the existing 'word' DB table.
# Modernized from Python 2 (`print >>`, dict.itervalues) to Python 3,
# consistent with the sibling versions of this script.
WORD = ngram.load_table('word')
WORD_AI = max(WORD.values()) if len(WORD) > 0 else 0
print("Loaded %d words. Starting at word id %d"
      % (len(WORD), WORD_AI), file=sys.stderr)


def word_id(word, outfile):
    """Return the integer id for *word*, allocating a new id for unseen words.

    Newly allocated ids are appended to *outfile* as an "id\\tword" TSV row,
    with the word escaped so it cannot corrupt the row. Words are truncated
    to 45 characters (presumably the DB column width — confirm against the
    'word' table schema).
    """
    global WORD, WORD_AI
    word = word[:45]
    v = WORD.get(word, None)
    if v is None:
        WORD_AI += 1
        v = WORD_AI
        WORD[word] = v
        print('%d\t%s' % (v, ngram.db.escape_string(word)), file=outfile)
    return v


def cached_lookup(key, cache, outfile):
    """Return the id for *key* from *cache*, allocating the next id if unseen.

    New ids are appended to *outfile* as an "id\\tkey" TSV row.
    """
    v = cache.get(key, None)
    if v is None:
        # default=0 guards the first allocation when the cache starts empty
        # (max() on an empty sequence would otherwise raise ValueError).
        v = max(cache.values(), default=0) + 1
        print('%d\t%s' % (v, ngram.db.escape_string(key)), file=outfile)
        cache[key] = v
    return v


# dependency-label -> id cache, seeded from the existing 'dep' DB table.
DEP = ngram.load_table('dep')