def create_entity_corpus(): data_folder = os.path.join(*[os.path.dirname(__file__), 'data', 'corpora']) analyzer = EntityAnalyzer() docs = [] count = 1 max_count = 50000 for case in CaseReportLibrary(): text = case.get_text() # get symptom and disease entities tokens = analyzer.parse(text) docs.append(tokens) count += 1 if count % 100 == 0: print count, "/", max_count if count >= max_count: break dictionary = corpora.Dictionary(docs) corpus = [dictionary.doc2bow(doc) for doc in docs] dictionary.save(os.path.join(data_folder, 'entity.dict')) corpora.MmCorpus.serialize(os.path.join(data_folder, 'entity.mm'), corpus)
def __init__(self, return_object=False):
    """Load the case-report library and per-document class labels.

    :param return_object: flag stored for later use by callers; presumably
        switches between returning case objects and raw text -- TODO confirm
        against the consuming code.
    """
    self.cases = CaseReportLibrary()
    self.labels = {}
    self.return_object = return_object
    # class.txt holds one "<pmcid> <class>" pair per line; the trailing
    # empty element from the final newline is dropped with [:-1].
    with open("classification/data/class.txt", 'r') as infile:
        lines = infile.read().split("\n")[:-1]
    # Fix: removed the dead local `count = 0` from the original -- it was
    # never incremented or read.
    for line in lines:
        pmcid, cls = line.split(" ")
        self.labels[pmcid] = cls
def __init__(self, size=50, modelfile=None):
    """Set up a doc2vec model over phrase-detected case-report sentences.

    :param size: dimensionality of the learned document vectors.
    :param modelfile: optional pre-trained model file passed through to
        D2Vmodel (semantics defined there).
    """
    self.phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False))
    # build model
    epochs = 2
    sentence_stream = PhraseSentenceStream(self.phrase_detector,
                                           extract_func=extract_docid,
                                           fz_docs=True,
                                           reshuffles=epochs - 1)
    self.model = D2Vmodel(sentence_stream,
                          name="DOCID",
                          dataset_name="CASEREPORT",
                          epochs=epochs,
                          dimension=size,
                          modelfile=modelfile)
    self.doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
def create_corpus(): data_folder = os.path.join(*[os.path.dirname(__file__), 'data', 'corpora']) docs = [] count = 1 max_count = 50000 for case in CaseReportLibrary(): # lower case all text (1) text = case.get_text() tokens = tokenize(text) docs.append(tokens) count += 1 if count % 100 == 0: print count, "/", max_count if count >= max_count: break dictionary = corpora.Dictionary(docs) corpus = [dictionary.doc2bow(doc) for doc in docs] dictionary.save(os.path.join(data_folder, 'raw.dict')) corpora.MmCorpus.serialize(os.path.join(data_folder, 'raw.mm'), corpus)
phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False), filename=str("PHRASE_%s_2_CASEREPORT_RAW" % (pmi_level, ))) # build model epochs = 2 m = D2Vmodel(PhraseSentenceStream(phrase_detector, extract_func=extract_docid, fz_docs=False, reshuffles=epochs - 1), name="DOCID", dataset_name="CASEREPORT", epochs=epochs, dimension=40) doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT") """ sims = m.inner_model.most_similar(['DOCID-FZ4870'],topn=20) vec_lung_cancer = m.inner_model['DOCID-FZ4870'] vec_colitis = m.inner_model['DOCID-FZ1397'] vec_crohn1 = m.inner_model['DOCID-FZ20248'] vec_crohn2 = m.inner_model['DOCID-FZ18205'] vec_colorectal_cancer = m.inner_model['DOCID-FZ9693'] vec_diabetes = m.inner_model['DOCID-FZ14622'] vec_huntington = m.inner_model['DOCID-FZ142'] vec_alzheimers = m.inner_model['DOCID-FZ3951'] vec_early_alzheimers = m.inner_model['DOCID-FZ3953'] """ vec_query1 = m.infer_doc_vector(
__author__ = 'matias'

from textanalysis.entityextractor import DiseaseExtractor, SymptomExtractor
from textanalysis.texts import CaseReportLibrary
from textanalysis.irdatastructs import InvertedIndex

# One inverted index per entity type, mapping entity -> document number.
d_index = InvertedIndex("disease")
s_index = InvertedIndex("symptom")
cases = CaseReportLibrary()
d_extractor = DiseaseExtractor()
s_extractor = SymptomExtractor()
count = 0
max_count = 50000
for case in cases:
    text = case.get_text()
    # count doubles as the document id handed to the indexes below.
    count += 1
    # de-duplicate extracted entities before indexing
    symptoms = list(set(s_extractor.extract(text)))
    diseases = list(set(d_extractor.extract(text)))
    s_index.add(symptoms, count)
    d_index.add(diseases, count)
    if count >= max_count:
        break
    # progress + sample output (skipped on the final, break-ing iteration)
    print count, "/", max_count
    print symptoms + diseases
s_index.save()
d_index.save()
def __init__(self): print "start" self.docids = [doc.get_id() for doc in CaseReportLibrary()] print "end"
__author__ = 'matias' import detectlanguage as dl from textanalysis.texts import CaseReportLibrary import pickle import os dl.configuration.api_key = "" def detect_english(text): result = dl.detect(text) return "en" in [e['language'] for e in result] if __name__ == "__main__": id2lang = {} for case in CaseReportLibrary(): abstract = case.get_abstract() pmcid = case.get_pmcid() is_english = detect_english(abstract) print is_english, abstract id2lang[pmcid] = is_english pickle.dump(id2lang, open("data/id2lang", 'w'))
__author__ = 'Matias' from textanalysis.texts import CaseReportLibrary class_labels = {} relabel = 0 with open('classification/data/class.txt', 'r') as infile: for line in infile.read().split("\n")[:-1]: pmcid, cls = line.split(" ") class_labels[pmcid] = cls if relabel: count = 0 for cs in CaseReportLibrary(): count += 1 pmcid = cs.get_pmcid() if pmcid in class_labels.keys() and class_labels[pmcid] == "1": print count, class_labels[pmcid], cs.title choice = raw_input( "Diagnosis(1), Test(2), Treatment(3), Drug reactions(4), Surgery adverse effect(5), Imaging(6) --- None(0) --- Quit(q)" ) class_labels[pmcid] = choice print ">>>" print "" if count > len(class_labels): break print len([ class_labels[pmcid] for pmcid in class_labels if class_labels[pmcid] == '1'
# Sweep over vector dimensionalities, training one doc2vec model per `dim`.
# NOTE(review): `dims`, `phrase_detector` and `pmi` are defined before this
# chunk -- not visible here.
for dim in dims:
    # build model
    epochs = 2
    m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                      extract_func=extract_docid,
                                      fz_docs=False,
                                      reshuffles=epochs - 1),
                 name="DOCID",
                 dataset_name="CASEREPORT",
                 epochs=epochs,
                 modelfile=str("DOC2VEC_CASEREPORT-PMI%s_DOCID_2" % (pmi, )),
                 dimension=dim)
    doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
    count = 0
    docid2vec = {}
    # Collect the learned vector for every case report present in the model
    # vocabulary, capped at 20000 documents.
    for case in CaseReportLibrary():
        docid = case.get_id()
        if docid in m.inner_model.vocab:
            vec = m.inner_model[docid]
            docid2vec[docid] = vec
            count += 1
            #print count, docid
        if count >= 20000:
            break
    # NOTE(review): counters apparently reset for an evaluation pass that
    # continues beyond this chunk -- structure past this point is assumed.
    count = 0
    false_count = 0
if __name__ == "__main__": # setup logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # phrase detector phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False)) filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Adv_Hematol_2010_Jun_16_2010_601548.nxml" filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Ann_Gastroenterol_2014_27(4)_418-420.nxml" filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Case_Rep_Ophthalmol_2011_Apr_22_2(1)_129-133.nxml" filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Case_Rep_Orthop_2011_Sep_26_2011_492407.nxml" # filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Adv_Urol_2010_May_10_2010_276497.nxml" # filename = r"C:\Users\matias\Desktop\thesis\data\casereports\Adv_Urol_2008_Oct_28_2008_173694.nxml" case_reports = CaseReportLibrary() # build model epochs = 10 m = D2Vmodel(PhraseSentenceStream(phrase_detector, extract_func=extract_docid, fz_docs=True, reshuffles=epochs - 1), name="DOCID", dataset_name="FINDZEBRA", epochs=epochs) fz_index = DocIndex(case_reports, "FINDZEBRA") cs_index = DocIndex(case_reports, "CASEREPORT") infer_index = InferredIndex(case_reports, "CASEREPORT", m, phrase_detector)