"""Convert abstracts with keywords into BIO-tagged tokens.

Assume the input is in the format:
    <Abstract text>
    <Count of keywords>
    <Keyword 1>
    ...
    <Keyword n>
Output:
    <Token> <Tag (BIO)>  (if Tag == B, also <Abstract number> <Keyword number>)
"""
import sys
from optparse import OptionParser

from nltk import word_tokenize

import lib

optionParser = OptionParser()
options, args = optionParser.parse_args()

# Read raw input from the file given as the first argument, or from
# stdin when no argument is given.
if len(args) == 0:
    raw = sys.stdin.read()
else:
    with open(args[0]) as f:
        raw = f.read()

lines = lib.get_dat(raw)

sys.stderr.write(str(len(lines)) + " entries\n")
for i in range(len(lines)):
    # Progress report every 100 entries.
    if i % 100 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = lines[i]
    abstract = line[0]      # line[1] is the keyword count
    keywords = line[2:]
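# The loop body above is truncated before it produces any output. Below
# is a minimal sketch of how the BIO tagging described in the docstring
# could be finished, assuming a keyword is tagged wherever its token
# sequence occurs verbatim in the abstract. print_bio() is a
# hypothetical helper, not part of the original script.
def print_bio(abstract, keywords, abstract_no):
    tokens = word_tokenize(abstract)
    tags = ["O"] * len(tokens)
    for k_no, keyword in enumerate(keywords):
        k_toks = word_tokenize(keyword)
        n = len(k_toks)
        for s in range(len(tokens) - n + 1):
            if tokens[s:s + n] == k_toks:
                # B on the first token of a match carries the abstract
                # and keyword numbers; the remaining tokens are I.
                tags[s] = "B " + str(abstract_no) + " " + str(k_no)
                for j in range(s + 1, s + n):
                    tags[j] = "I"
    for token, tag in zip(tokens, tags):
        print(token + " " + tag)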
sys.stderr.write("Not lowercase\n")

if lemmatize:
    sys.stderr.write("Lemmatize\n")
    import nltk
    # Set up a WordNet lemmatizer for use when building the dictionary.
    wnl = nltk.stem.WordNetLemmatizer()
else:
    sys.stderr.write("Not lemmatize\n")

# Read raw input from the file given as the first argument, or from
# stdin when no argument is given.
if len(args) == 0:
    raw = sys.stdin.read()
else:
    with open(args[0]) as f:
        raw = f.read()

lines = lib.get_dat(raw)

je_dict = {}
sys.stderr.write("Start making dict\n")

count = 0
sys.stderr.write("Total: " + str(len(lines)) + " entries\n")
for line in lines:
    # Progress marker every 1000 entries, with a line break every 10000.
    if count % 1000 == 0:
        sys.stderr.write(str(count) + " ")
    if count % 10000 == 0:
        sys.stderr.write("\n")
    count += 1
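# The loop above is truncated before it fills je_dict. Judging by the
# name, je_dict maps Japanese terms to English glosses; below is a
# minimal sketch of one way an entry could be added, with the record
# layout assumed and the English side lemmatized when the option is on.
# add_entry() is a hypothetical helper, not part of the original script.
def add_entry(je_dict, japanese, english, wnl=None):
    gloss = english
    if wnl is not None:
        # WordNetLemmatizer works one word at a time and assumes a noun
        # unless a POS is given, e.g. wnl.lemmatize("entries") -> "entry".
        gloss = " ".join(wnl.lemmatize(w) for w in english.split())
    je_dict.setdefault(japanese, set()).add(gloss)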