Example No. 1
0
import lib
import sys
from optparse import OptionParser
from nltk import word_tokenize

# Parse command-line options; leftover positional args are input file paths.
optionParser = OptionParser()

options, args = optionParser.parse_args()

# Read the raw input either from stdin (no positional args) or from the
# first file argument.
# Fix: the original called open() without ever closing the handle; a
# context manager closes it deterministically.
if not args:
    raw = sys.stdin.read()
else:
    with open(args[0]) as f:
        raw = f.read()

# Split the raw text into per-entry records via the project helper.
#
# Assumed input format per entry (per the original author's note):
#     <Abstract text> <Count of keyword>  <Keyword 1> ... <Keyword n>
# Output format per token:
#     <Token> <Tag (BIO)> (If Tag==B <Abstract number>   <Keyword number>)
lines = lib.get_dat(raw)

sys.stderr.write("{0} entries\n".format(len(lines)))

# Walk every parsed entry, reporting progress to stderr every 100 entries.
# NOTE(review): this loop body is truncated at the example boundary — the
# processing of `abstract`/`keywords` continues beyond the visible excerpt.
for i in range(len(lines)):
    if i % 100 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = lines[i]
    abstract = line[0]   # the abstract text (first field of the record)
    keywords = line[2:]  # keyword list; index 1 (the keyword count) is skipped
Example No. 2
0
    sys.stderr.write("Not lowercase\n")

# Optionally set up WordNet lemmatization.  `lemmatize` is presumably an
# option flag defined earlier in the full script (not visible here) — confirm.
if not lemmatize:
    sys.stderr.write("Not lemmatize\n")
else:
    sys.stderr.write("Lemmatize\n")
    import nltk
    wnl = nltk.stem.WordNetLemmatizer()

# Read the raw input: stdin when there are no positional args, otherwise
# the first file argument.
# Fix: the original left the file handle open; a context manager closes it.
if not args:
    raw = sys.stdin.read()
else:
    with open(args[0]) as f:
        raw = f.read()

# Parse the raw text into per-entry records via the project helper.
lines = lib.get_dat(raw)

# Lookup table populated by the loop below; starts empty.
# NOTE(review): cannot tell from this excerpt what "je" abbreviates.
# Fix: `dict([])` replaced with the idiomatic (and faster) literal `{}`.
je_dict = {}

sys.stderr.write("Start making dict\n")

# Running count of processed entries, used for progress reporting.
count = 0

sys.stderr.write("Total: " + str(len(lines)) + " entries\n")

# Iterate the entries, printing the running count to stderr every 1000
# entries and a newline every 10000 to keep the progress log readable.
# NOTE(review): the loop body is truncated at the end of this excerpt —
# `line` is never used in the visible lines, so the real work presumably
# follows beyond them.
for line in lines:
    if count%1000 == 0:
        sys.stderr.write(str(count) + " ")
    if count%10000 == 0:
        sys.stderr.write("\n")
    count += 1