Code example #1
def create_entity_corpus():
    data_folder = os.path.join(os.path.dirname(__file__), 'data', 'corpora')

    analyzer = EntityAnalyzer()

    docs = []
    count = 0
    max_count = 50000
    for case in CaseReportLibrary():
        text = case.get_text()
        # get symptom and disease entities
        tokens = analyzer.parse(text)

        docs.append(tokens)
        count += 1
        if count % 100 == 0:
            print count, "/", max_count
        if count >= max_count:
            break

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    dictionary.save(os.path.join(data_folder, 'entity.dict'))
    corpora.MmCorpus.serialize(os.path.join(data_folder, 'entity.mm'), corpus)
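For reference, the two artifacts written above can be loaded back with gensim's standard loaders (a minimal sketch, not part of the project's snippets; paths assumed to match those saved above):

from gensim import corpora

# load the persisted dictionary and bag-of-words corpus
dictionary = corpora.Dictionary.load(os.path.join(data_folder, 'entity.dict'))
corpus = corpora.MmCorpus(os.path.join(data_folder, 'entity.mm'))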
Code example #2
 def __init__(self, return_object=False):
     self.cases = CaseReportLibrary()
     self.labels = {}
     self.return_object = return_object
     with open("classification/data/class.txt", 'r') as infile:
         lines = infile.read().split("\n")[:-1]
         for line in lines:
             pmcid, cls = line.split(" ")
             self.labels[pmcid] = cls
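The parsing above assumes classification/data/class.txt holds one space-separated pmcid/class pair per line, along these lines (values are hypothetical):

PMC1234567 1
PMC7654321 0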
Code example #3
 def __init__(self, size=50, modelfile=None):
     self.phrase_detector = PmiPhraseDetector(
         RawSentenceStream(fz_docs=False))
     # build model
     epochs = 2
     self.model = D2Vmodel(
         PhraseSentenceStream(self.phrase_detector,
                              extract_func=extract_docid,
                              fz_docs=True,
                              reshuffles=epochs - 1),
         name="DOCID",
         dataset_name="CASEREPORT",
         epochs=epochs,
         dimension=size,
         modelfile=modelfile,
     )
     self.doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
Code example #4
def create_corpus():
    data_folder = os.path.join(os.path.dirname(__file__), 'data', 'corpora')

    docs = []
    count = 0
    max_count = 50000
    for case in CaseReportLibrary():
        # lower-case all text and tokenize
        text = case.get_text()
        tokens = tokenize(text)
        docs.append(tokens)
        count += 1
        if count % 100 == 0:
            print count, "/", max_count
        if count >= max_count:
            break

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    dictionary.save(os.path.join(data_folder, 'raw.dict'))
    corpora.MmCorpus.serialize(os.path.join(data_folder, 'raw.mm'), corpus)
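A common next step with such a bag-of-words corpus, though not shown in these snippets, is a TF-IDF transform via gensim (a minimal sketch, assuming the corpus built above):

from gensim import models

# reweight raw term counts by inverse document frequency
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]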
Code example #5
File: build_doc2vec.py Project: waternk/medical-text
phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False),
                                    filename=str("PHRASE_%s_2_CASEREPORT_RAW" %
                                                 (pmi_level, )))

# build model
epochs = 2
m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                  extract_func=extract_docid,
                                  fz_docs=False,
                                  reshuffles=epochs - 1),
             name="DOCID",
             dataset_name="CASEREPORT",
             epochs=epochs,
             dimension=40)

doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
"""
sims = m.inner_model.most_similar(['DOCID-FZ4870'],topn=20)

vec_lung_cancer = m.inner_model['DOCID-FZ4870']
vec_colitis = m.inner_model['DOCID-FZ1397']
vec_crohn1 = m.inner_model['DOCID-FZ20248']
vec_crohn2 = m.inner_model['DOCID-FZ18205']
vec_colorectal_cancer = m.inner_model['DOCID-FZ9693']
vec_diabetes = m.inner_model['DOCID-FZ14622']
vec_huntington = m.inner_model['DOCID-FZ142']
vec_alzheimers = m.inner_model['DOCID-FZ3951']
vec_early_alzheimers = m.inner_model['DOCID-FZ3953']
"""

vec_query1 = m.infer_doc_vector(
Code example #6
__author__ = 'matias'

from textanalysis.entityextractor import DiseaseExtractor, SymptomExtractor
from textanalysis.texts import CaseReportLibrary
from textanalysis.irdatastructs import InvertedIndex

d_index = InvertedIndex("disease")
s_index = InvertedIndex("symptom")

cases = CaseReportLibrary()
d_extractor = DiseaseExtractor()
s_extractor = SymptomExtractor()

count = 0
max_count = 50000
for case in cases:
    text = case.get_text()
    count += 1
    symptoms = list(set(s_extractor.extract(text)))
    diseases = list(set(d_extractor.extract(text)))
    s_index.add(symptoms, count)
    d_index.add(diseases, count)
    if count >= max_count:
        break
    print count, "/", max_count
    print symptoms + diseases

s_index.save()
d_index.save()
Code example #7
 def __init__(self):
     print "start"
     self.docids = [doc.get_id() for doc in CaseReportLibrary()]
     print "end"
Code example #8
File: detectlang.py Project: waternk/medical-text
__author__ = 'matias'

import detectlanguage as dl
from textanalysis.texts import CaseReportLibrary
import pickle
import os

dl.configuration.api_key = ""


def detect_english(text):
    result = dl.detect(text)
    return "en" in [e['language'] for e in result]


if __name__ == "__main__":
    id2lang = {}

    for case in CaseReportLibrary():
        abstract = case.get_abstract()
        pmcid = case.get_pmcid()
        is_english = detect_english(abstract)
        print is_english, abstract
        id2lang[pmcid] = is_english

    # write in binary mode and close the file explicitly
    with open("data/id2lang", 'wb') as outfile:
        pickle.dump(id2lang, outfile)
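Reading the pickled map back and keeping only the English case reports would look roughly like this (assumed usage, not part of the project's snippets):

with open("data/id2lang", 'rb') as infile:
    id2lang = pickle.load(infile)
english_ids = [pmcid for pmcid, is_en in id2lang.items() if is_en]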
Code example #9
__author__ = 'Matias'

from textanalysis.texts import CaseReportLibrary

class_labels = {}

relabel = 0

with open('classification/data/class.txt', 'r') as infile:
    for line in infile.read().split("\n")[:-1]:
        pmcid, cls = line.split(" ")
        class_labels[pmcid] = cls

if relabel:
    count = 0
    for cs in CaseReportLibrary():
        count += 1
        pmcid = cs.get_pmcid()
        if pmcid in class_labels and class_labels[pmcid] == "1":
            print count, class_labels[pmcid], cs.title
            choice = raw_input(
                "Diagnosis(1), Test(2), Treatment(3), Drug reactions(4), Surgery adverse effect(5), Imaging(6) --- None(0) --- Quit(q)"
            )
            class_labels[pmcid] = choice
            print ">>>"
            print ""
        if count > len(class_labels):
            break
    print len([
        class_labels[pmcid] for pmcid in class_labels
        if class_labels[pmcid] == '1'
    ])
Code example #10
    for dim in dims:
        # build model
        epochs = 2
        m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                          extract_func=extract_docid,
                                          fz_docs=False,
                                          reshuffles=epochs - 1),
                     name="DOCID",
                     dataset_name="CASEREPORT",
                     epochs=epochs,
                     modelfile=str("DOC2VEC_CASEREPORT-PMI%s_DOCID_2" %
                                   (pmi, )),
                     dimension=dim)

        doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")

        count = 0
        docid2vec = {}
        for case in CaseReportLibrary():
            docid = case.get_id()
            if docid in m.inner_model.vocab:
                vec = m.inner_model[docid]
                docid2vec[docid] = vec
            count += 1
            #print count, docid
            if count >= 20000:
                break

        count = 0
        false_count = 0
Code example #11
if __name__ == "__main__":
    # setup logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # phrase detector
    phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False))

    filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Adv_Hematol_2010_Jun_16_2010_601548.nxml"
    filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Ann_Gastroenterol_2014_27(4)_418-420.nxml"
    filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Case_Rep_Ophthalmol_2011_Apr_22_2(1)_129-133.nxml"
    filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Case_Rep_Orthop_2011_Sep_26_2011_492407.nxml"
    # filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Adv_Urol_2010_May_10_2010_276497.nxml"
    # filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Adv_Urol_2008_Oct_28_2008_173694.nxml"

    case_reports = CaseReportLibrary()

    # build model
    epochs = 10
    m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                      extract_func=extract_docid,
                                      fz_docs=True,
                                      reshuffles=epochs - 1),
                 name="DOCID",
                 dataset_name="FINDZEBRA",
                 epochs=epochs)

    fz_index = DocIndex(case_reports, "FINDZEBRA")
    cs_index = DocIndex(case_reports, "CASEREPORT")
    infer_index = InferredIndex(case_reports, "CASEREPORT", m, phrase_detector)
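The commented-out block in code example #5 shows how such a model is queried; a minimal sketch, assuming document tags are addressable through inner_model as in older gensim Doc2Vec versions:

# nearest neighbours of one tagged case report
sims = m.inner_model.most_similar(['DOCID-FZ4870'], topn=20)
for docid, score in sims:
    print docid, score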