def biomarkerMediumRelations(filename): # Processing the input data and converting to sentences # If the file has already been parsed, there is no point in reparsing. # Just open the already parsed sentences pckl_f = "cache/" + filename + "/" + 'sentences.pkl' # try: # #Try to see if the sentences have been parsed # with open(pckl_f, 'rb') as f: # #Load em in if they have # sentences = cPickle.load(f) # except: sentences = doc_parser.parseDoc(filename) # Serialize so that you dont have to deal with parsing in the future # if not os.path.exists("cache/" + _filename +"/" ): # os.makedirs("cache/" + _filename + "/") # with open(pckl_f, 'w+') as f: # cPickle.dump(sentences, f) biomarker_ngrams = Ngrams(n_max=1) medium_ngrams = Ngrams(n_max=3) # Create the two matchers who have been defined in separate classes BM = matchers.getBiomarkerMatcher() MM = matchers.getMediumMatcher() CandidateExtractor_BM = CandidateExtractor(biomarker_ngrams, BM) CandidateExtractor_MM = CandidateExtractor(medium_ngrams, MM) # Create the relations using the two matchers possiblePairs = Relations(sentences, CandidateExtractor_BM, CandidateExtractor_MM) return possiblePairs
def biomarkerDrugRelations(filename): # Processing the input data and converting to sentences sentences = doc_parser.parseDoc(filename) # If the file has already been parsed, there is no point in reparsing. # Just open the already parsed sentences # print sentences # Serialize so that you dont have to deal with parsing in the future # if not os.path.exists("cache/" + filename +"/" ): # os.makedirs("cache/" + filename + "/") # with open(pckl_f, 'w+') as f: # cPickle.dump(sentences, f) biomarker_ngrams = Ngrams(n_max=1) drug_association_ngrams = Ngrams(n_max=5) BM = matchers.getBiomarkerMatcher() DAM = matchers.getDrugAssociationMatcher() CandidateExtractor_BM = CandidateExtractor(biomarker_ngrams, BM) CandidateExtractor_DAM = CandidateExtractor(drug_association_ngrams, DAM) possiblePairs = Relations(sentences, CandidateExtractor_BM, CandidateExtractor_DAM) return possiblePairs
def generateAllTupleRelations(filename): sentences = doc_parser.parseDoc(filename) possiblePairs_BM_L = LevelsRelation_BiomarkerLevels_Extraction.generateRelations( sentences) possiblePairs_BM_TS = LevelsRelation_BiomarkerTestSet_Extraction.generateRelations( sentences) possiblePairs_BM_MT = LevelsRelation_BiomarkerMeasurementType_Extraction.generateRelations( sentences) possiblePairs_BM_U = LevelsRelation_BiomarkerUnits_Extraction.generateRelations( sentences) all = [] # Generate all possible Multi-Relational Objects for pair_BM_L in possiblePairs_BM_L: for pair_BM_TS in possiblePairs_BM_TS: if pair_BM_L.mention1(attribute='sent_id') == pair_BM_TS.mention1( attribute='sent_id') and pair_BM_L.mention1( attribute='char_offsets') == pair_BM_TS.mention1( attribute='char_offsets'): for pair_BM_MT in possiblePairs_BM_MT: if pair_BM_L.mention1( attribute='sent_id') == pair_BM_MT.mention1( attribute='sent_id') and pair_BM_L.mention1( attribute='char_offsets' ) == pair_BM_MT.mention1( attribute='char_offsets'): for pair_BM_U in possiblePairs_BM_U: if pair_BM_L.mention1( attribute='sent_id') == pair_BM_U.mention1( attribute='sent_id' ) and pair_BM_L.mention1( attribute='char_offsets' ) == pair_BM_U.mention1( attribute='char_offsets'): multiRelation = [ pair_BM_L, pair_BM_TS, pair_BM_MT, pair_BM_U ] all.append(multiRelation) return all
def AllLevelsRelationTuples(filename): sentences = doc_parser.parseDoc(filename) possiblePairs_BM_U = biomarkerUnitsRelations(sentences) possiblePairs_BM_L = biomarkerLevelsRelations(sentences) possiblePairs_BM_MT = biomarkerMeasurementRelations(sentences) possiblePairs_BM_TS = biomarkerTestsetRelations(sentences) # highly innefficient...fix! all = [] for pair_BM_L in possiblePairs_BM_L: for pair_BM_TS in possiblePairs_BM_TS: if pair_BM_L.mention1(attribute='sent_id') == pair_BM_TS.mention1( attribute='sent_id') and pair_BM_L.mention1( attribute='char_offsets') == pair_BM_TS.mention1( attribute='char_offsets'): for pair_BM_MT in possiblePairs_BM_MT: if pair_BM_L.mention1( attribute='sent_id') == pair_BM_MT.mention1( attribute='sent_id') and pair_BM_L.mention1( attribute='char_offsets' ) == pair_BM_MT.mention1( attribute='char_offsets'): for pair_BM_U in possiblePairs_BM_U: if pair_BM_L.mention1( attribute='sent_id') == pair_BM_U.mention1( attribute='sent_id' ) and pair_BM_L.mention1( attribute='char_offsets' ) == pair_BM_U.mention1( attribute='char_offsets'): multiRelation = [ pair_BM_L, pair_BM_TS, pair_BM_MT, pair_BM_U ] all.append(multiRelation) return all
def biomarkerTypeRelations(filename): """ Processing the input data and converting to sentences """ # If the file has already been parsed, there is no point in reparsing. # Just open the already parsed sentences # pckl_f = "cache/" + filename + "/" + 'sentences.pkl' # try: # #Try to see if the sentences have been parsed # with open(pckl_f, 'rb') as f: # #Load em in if they have # sentences = cPickle.load(f) # except: sentences = doc_parser.parseDoc(filename) # Serialize so that you dont have to deal with parsing in the future # if not os.path.exists("cache/" + filename +"/" ): # os.makedirs("cache/" + filename + "/") # with open(pckl_f, 'w+') as f: # cPickle.dump(sentences, f) biomarker_ngrams = Ngrams(n_max=1) biomarker_type_ngrams = Ngrams(n_max=2) BM = matchers.getBiomarkerMatcher() TM = matchers.getBiomarkerTypeMatcher() CandidateExtractor_BM = CandidateExtractor(biomarker_ngrams, BM) CandidateExtractor_TM = CandidateExtractor(biomarker_type_ngrams, TM) possiblePairs = Relations(sentences, CandidateExtractor_BM, CandidateExtractor_TM) return possiblePairs
def generateRelations(filename): sentences = doc_parser.parseDoc(filename) possiblePairs = LevelsRelation_BiomarkerLevels_Extraction.generateRelations( sentences) return possiblePairs
def generateRelations(_filename): # Processing the input data and converting to sentences # If the file has already been parsed, there is no point in reparsing. # Just open the already parsed sentences #pckl_f = _filename # try: # #Try to see if the sentences have been parsed # with open(pckl_f, 'rb') as f: # #Load em in if they have # sentences = pickle.load(f) # except: #doc_parser = TextDocParser(_filename) #sent_parser = SentenceParser() #corpus = Corpus(doc_parser, sent_parser) # print corpus # Sentences havent been parsed, so parse them now #sentences = corpus.get_contexts() sentences = doc_parser.parseDoc(_filename) print sentences # Serialize so that you dont have to deal with parsing in the future # if not os.path.exists("cache/" + _filename +"/" ): # os.makedirs("cache/" + _filename + "/") # with open(pckl_f, 'w+') as f: # cPickle.dump(sentences, f) biomarker_ngrams = Ngrams(n_max=1) disease_ngrams = Ngrams(n_max=5) # Create the two matchers who have been defined in separate classes BM = BiomarkerMatcher.getBiomarkerMatcher() CandidateExtractor_BM = CandidateExtractor(biomarker_ngrams, BM) DM = DiseaseMatcher.getDiseaseMatcher() CandidateExtractor_DM = CandidateExtractor(disease_ngrams, DM) # #fix disease candidate generator error- "ovarian and prostate cancer" # with open('diseaseDatabase.pickle', 'rb') as f: # diseaseDictionary = pickle.load(f) # DiseaseMatch = DictionaryMatch(label = "Diseases", dictionary = diseaseDictionary, ignore_case= True) # E = Entities(sentences, DiseaseMatch) # filename = "AGR2_blood_biomarker.txt" # text = open(filename, "r").read() # editedText = DiseaseCandidateGenerator.addDiseaseBases(E, diseaseDictionary ,text) # editedSentences = [] # sentence_parser = SentenceParser() # list = sentence_parser.parse(editedText, 1) # for editsentence in list: # editedSentences.append(editsentence) # Create the relations using the two matchers possiblePairs = Relations( sentences, CandidateExtractor_BM, CandidateExtractor_DM) # badCount = 0 # goodCount = 0 # allGoodPairs = [] # for goodPair in possiblePairs: # allGoodPairs.append(goodPair.sent_id) # #Return these pairs # for sentence in sentences: # if sentence.sent_id in allGoodPairs: # print "GOOD PAIR: " # print sentence.words # print "\n" # goodCount += 1 # else: # print "BAD PAIR:" # print sentence.words # print "\n" # badCount += 1 # print goodCount # print badCount # print possiblePairs return possiblePairs