Beispiel #1
0
def build_ngram_text(infile, outfile):
    """Write the n-grams of every word in ``infile`` to ``outfile``.

    Both names are resolved relative to ``dnu.DATA_DIR``. The input is read
    line by line and split on whitespace; each word is passed to
    ``dnu.str_to_ngrams`` with ``dnu.GRAM_SIZE``. Every word that yields at
    least one n-gram produces one output line holding its n-grams separated
    by single spaces.

    Args:
        infile: Name of the input file inside ``dnu.DATA_DIR``.
        outfile: Name of the output file inside ``dnu.DATA_DIR``; it is
            opened in write mode, so existing content is replaced.
    """
    in_path = os.path.join(dnu.DATA_DIR, infile)
    out_path = os.path.join(dnu.DATA_DIR, outfile)
    # The context managers close both handles even when reading, n-gram
    # extraction or writing raises part-way through the file.
    with open(in_path, 'rb') as fin, open(out_path, 'wb') as fout:
        for line in fin:
            for word in line.strip().split():
                ngrams = dnu.str_to_ngrams(word, dnu.GRAM_SIZE)
                # Words that produce no n-grams are skipped entirely.
                if len(ngrams) > 0:
                    fout.write("%s\n" % " ".join(ngrams))
def build_ngram_text(infile, outfile):
    """Convert a text file into a file of space-separated word n-grams.

    Each whitespace-delimited word of ``infile`` is turned into n-grams via
    ``dnu.str_to_ngrams(word, dnu.GRAM_SIZE)``. A word with at least one
    n-gram contributes exactly one line to ``outfile``; words with none are
    dropped.

    Args:
        infile: Input file name, looked up under ``dnu.DATA_DIR``.
        outfile: Output file name, created (or truncated) under
            ``dnu.DATA_DIR``.
    """
    # Open the input first so a missing input never creates an empty output.
    with open(os.path.join(dnu.DATA_DIR, infile), 'rb') as fin:
        # Nested ``with`` blocks guarantee both files are closed on any
        # exit path, including exceptions raised inside the loop.
        with open(os.path.join(dnu.DATA_DIR, outfile), 'wb') as fout:
            for line in fin:
                for word in line.strip().split():
                    ngrams = dnu.str_to_ngrams(word, dnu.GRAM_SIZE)
                    if len(ngrams) > 0:
                        fout.write("%s\n" % " ".join(ngrams))
from sklearn.externals import joblib
import drug_ner_utils as dnu
import os

def _jaccard(left, right):
    """Return the Jaccard coefficient of two sets, or 0.0 if both are empty."""
    union_size = len(left.union(right))
    if union_size == 0:
        return 0.0
    # float() forces true division: under Python 2, int / int floors, which
    # turned every partial overlap into 0 so that no word could ever clear
    # the annotation threshold.
    return float(len(left.intersection(right))) / union_size


# Reference n-gram sets, built from the pickled objects after passing them
# through dnu.truncate_fd with limits of 100 and 50 respectively.
# NOTE(review): presumably frequency distributions cut down to their most
# common n-grams -- confirm against dnu.truncate_fd.
generic_fd = set(dnu.truncate_fd(
    joblib.load(os.path.join(dnu.DATA_DIR, "generic_fd.pkl")), 100))
brand_fd = set(dnu.truncate_fd(
    joblib.load(os.path.join(dnu.DATA_DIR, "brand_fd.pkl")), 50))

# A word is tagged when its Jaccard coefficient against a reference set is
# strictly greater than this value.
JACCARD_THRESHOLD = 0.01

# Annotate every line of the raw data; the file is closed on any exit path.
with open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') as fraw:
    for line in fraw:
        line = line.strip().lower()
        annotated = []
        for word in line.split():
            ngrams = set(dnu.str_to_ngrams(word, dnu.GRAM_SIZE))
            jc_generic = _jaccard(ngrams, generic_fd)
            jc_brand = _jaccard(ngrams, brand_fd)
            # Per-word debug trace: the word followed by both scores.
            print("%s %s %s" % (word, jc_generic, jc_brand))
            # GENERIC wins when both scores clear the threshold.
            if jc_generic > JACCARD_THRESHOLD:
                annotated.append("<GENERIC>%s</GENERIC>" % (word))
            elif jc_brand > JACCARD_THRESHOLD:
                annotated.append("<BRAND>%s</BRAND>" % (word))
            else:
                annotated.append(word)
        print("Input: %s" % (line))
        print("Output: %s" % (" ".join(annotated)))
# generic_clf, X and y are expected to have been set up by earlier code.
print("Score for generic classifier: %.3f" % (generic_clf.score(X, y)))

# Brand-name classifier. The labels returned by dnu.vectorize are replaced
# by the label vector stored in y_brand_3.pkl before fitting.
X, y, brand_vec = dnu.vectorize("unlabeled.txt", "brand_positive.txt", 100)

y = joblib.load(os.path.join(dnu.DATA_DIR, "y_brand_3.pkl"))
brand_clf = LinearSVC()
brand_clf.fit(X, y)
# Scored on the same X, y the classifier was just fit on.
print("Score for brand classifier: %.3f" % (brand_clf.score(X, y)))

# Annotate a sample of the raw data. The counter is checked after the
# increment with ``> 10``, so exactly 11 lines are processed (fewer if the
# file is shorter). The ``with`` block closes the file on any exit path.
with open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') as fraw:
    i = 0
    for line in fraw:
        line = line.strip().lower()
        annotated = []
        for word in line.split():
            ngrams = dnu.str_to_ngrams(word, dnu.GRAM_SIZE)
            # Both vectorizers receive the same space-joined n-gram string,
            # so build it once per word.
            ngram_text = " ".join(ngrams)
            Xgen = generic_vec.transform([ngram_text])
            Xbrand = brand_vec.transform([ngram_text])
            # predict() is given a single sample; take its only prediction
            # instead of relying on the truth value of a one-element array.
            is_generic = generic_clf.predict(Xgen)[0]
            is_brand = brand_clf.predict(Xbrand)[0]
            # GENERIC wins when both classifiers predict label 1.
            if is_generic == 1:
                annotated.append("<GENERIC>" + word + "</GENERIC>")
            elif is_brand == 1:
                annotated.append("<BRAND>" + word + "</BRAND>")
            else:
                annotated.append(word)
        print("Input: %s" % (line))
        print("Output: %s" % (" ".join(annotated)))
        i += 1
        if i > 10:
            break
Beispiel #5
0
# generic_clf, X and y are expected to have been set up by earlier code.
print("Score for generic classifier: %.3f" % (generic_clf.score(X, y)))

# Train the brand-name classifier: features come from dnu.vectorize, while
# the labels it returns are overwritten by those loaded from y_brand_3.pkl.
X, y, brand_vec = dnu.vectorize("unlabeled.txt", "brand_positive.txt", 100)

y = joblib.load(os.path.join(dnu.DATA_DIR, "y_brand_3.pkl"))
brand_clf = LinearSVC()
brand_clf.fit(X, y)
# The score is computed on the data used for fitting.
print("Score for brand classifier: %.3f" % (brand_clf.score(X, y)))

# Tag words in the first 11 lines of the raw data (the loop stops once the
# 1-based line counter exceeds 10). Using ``with`` ensures the file handle
# is released even if vectorizing or predicting raises.
with open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') as fraw:
    for i, line in enumerate(fraw, 1):
        line = line.strip().lower()
        annotated = []
        for word in line.split():
            ngrams = dnu.str_to_ngrams(word, dnu.GRAM_SIZE)
            # One joined string serves both vectorizers.
            joined = " ".join(ngrams)
            Xgen = generic_vec.transform([joined])
            Xbrand = brand_vec.transform([joined])
            # A single sample goes in, so index the single prediction out
            # rather than comparing a whole array against 1.
            is_generic = generic_clf.predict(Xgen)[0]
            is_brand = brand_clf.predict(Xbrand)[0]
            # The generic label is checked first and takes precedence.
            if is_generic == 1:
                annotated.append("<GENERIC>" + word + "</GENERIC>")
            elif is_brand == 1:
                annotated.append("<BRAND>" + word + "</BRAND>")
            else:
                annotated.append(word)
        print("Input: %s" % (line))
        print("Output: %s" % (" ".join(annotated)))
        if i > 10:
            break
import os

def _jaccard(left, right):
    """Return |left & right| / |left | right| as a float.

    Two empty sets give 0.0 instead of raising ZeroDivisionError.
    """
    union_size = len(left | right)
    if not union_size:
        return 0.0
    # Convert before dividing: Python 2 floors int / int, so the original
    # ``1.0 * (a / b)`` evaluated to 0.0 for every partial overlap.
    return float(len(left & right)) / union_size


# Reference n-gram sets: the pickled objects are passed through
# dnu.truncate_fd (limits 100 and 50) and converted to sets.
# NOTE(review): looks like these are frequency distributions trimmed to
# their top entries -- confirm against dnu.truncate_fd.
generic_fd = set(
    dnu.truncate_fd(joblib.load(os.path.join(dnu.DATA_DIR, "generic_fd.pkl")),
                    100))
brand_fd = set(
    dnu.truncate_fd(joblib.load(os.path.join(dnu.DATA_DIR, "brand_fd.pkl")),
                    50))

# Minimum (exclusive) Jaccard coefficient for a word to be tagged.
JACCARD_THRESHOLD = 0.01

# Process the whole raw data file; ``with`` closes it on any exit path.
with open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') as fraw:
    for line in fraw:
        line = line.strip().lower()
        annotated = []
        for word in line.split():
            ngrams = set(dnu.str_to_ngrams(word, dnu.GRAM_SIZE))
            jc_generic = _jaccard(ngrams, generic_fd)
            jc_brand = _jaccard(ngrams, brand_fd)
            # Debug trace of the word and its two similarity scores.
            print("%s %s %s" % (word, jc_generic, jc_brand))
            # The generic match is tested first and therefore wins ties.
            if jc_generic > JACCARD_THRESHOLD:
                annotated.append("<GENERIC>%s</GENERIC>" % (word))
            elif jc_brand > JACCARD_THRESHOLD:
                annotated.append("<BRAND>%s</BRAND>" % (word))
            else:
                annotated.append(word)
        print("Input: %s" % (line))
        print("Output: %s" % (" ".join(annotated)))