def identify_interaction(self, path_to_test_file):
    """Predict, per sentence, whether a drug-drug interaction is present.

    Parses the test XML document, tokenizes each <sentence>'s text and
    scans its bigrams against the pre-trained ``self.bigrams_based_on_score``
    table.  The first matching bigram decides the sentence's label; a
    sentence with no matching bigram is labelled ``"false"``.

    :param path_to_test_file: path to the DDI-corpus XML test document
    :return: dict mapping sentence id -> predicted label
    """
    result = {}
    # 'with' guarantees the handle is closed even if parsing raises;
    # the original closed it manually only on the success path.
    with open(path_to_test_file, 'rb') as document_data:
        xml_data = parse(document_data)
        for sentence in xml_data.getElementsByTagName("sentence"):
            sentence_attrs = dict(sentence.attributes.items())
            sentence_text = sentence_attrs["text"]
            sentence_id = sentence_attrs["id"]
            # NOTE(review): the original also collected this sentence's
            # entities and generated trigrams here, but never used either
            # in this method -- dead code, removed.
            words = utils.tokenize_string_without_punctuations(sentence_text)
            bigrams = utils.generate_bigrams(words)
            for bigram in bigrams:
                # Membership test on the dict itself; '.keys()' built a
                # throwaway list under Python 2.
                if bigram in self.bigrams_based_on_score:
                    result[sentence_id] = self.bigrams_based_on_score[bigram]
                    # Single-argument print() is valid in both Py2 and Py3.
                    print("bigram as a result of which classification is done:" + bigram)
                    break
            else:
                # No bigram matched: predict no interaction.
                result[sentence_id] = "false"
    return result
def parse_ddi_corpus(self, file):
    """Parse one DDI-corpus XML training file and accumulate pair labels.

    For every <sentence>: increments ``self.total_sentence_count``,
    records each <entity> (lower-cased text and type) in
    ``self.entity_collection``, and stores each <pair>'s ddi verdict --
    normalised to "0"/"1" -- in ``self.pairs_collection`` and in the
    entity-to-entity ``self.interaction_collection`` table.

    :param file: path to the XML document (name shadows the Py2 builtin
        but is kept unchanged for interface compatibility)
    """
    # 'with' closes the handle even when parsing raises.
    with open(file, 'r') as document_data:
        xml_data = parse(document_data)
        for sentence in xml_data.getElementsByTagName("sentence"):
            self.total_sentence_count += 1
            sentence_attrs = dict(sentence.attributes.items())
            sentence_text = sentence_attrs["text"]
            sentence_id = sentence_attrs["id"]
            words = utils.tokenize_string_without_punctuations(sentence_text)
            for entity in sentence.getElementsByTagName("entity"):
                entity_attrs = dict(entity.attributes.items())
                # Renamed from 'id'/'type'/'text' to avoid shadowing
                # builtins and the sentence text above.
                self.entity_collection[entity_attrs["id"]] = {
                    "text": entity_attrs["text"].lower(),
                    "type": entity_attrs["type"].lower(),
                }
            # Strip drug names from the token list so only context words
            # feed the statistics.
            # NOTE(review): this scans ALL entities accumulated so far, not
            # only this sentence's -- and list.remove drops only the first
            # occurrence; confirm both are intended.
            for ids in self.entity_collection:
                if self.entity_collection[ids]["text"] in words:
                    words.remove(self.entity_collection[ids]["text"])
            for pair in sentence.getElementsByTagName("pair"):
                pair_attrs = dict(pair.attributes.items())
                # Normalise the ddi flag: missing -> "0", "false" -> "0",
                # "true" -> "1" (same behaviour as the guard-then-index
                # pattern in the original).
                ddi = pair_attrs.get("ddi", "0")
                if ddi == "false":
                    ddi = "0"
                elif ddi == "true":
                    ddi = "1"
                # NOTE(review): the original also read the unused "type"
                # attribute here; dead code, removed.
                self.pairs_collection[pair_attrs["id"]] = ddi
                entity_1 = self.entity_collection[pair_attrs["e1"]]["text"]
                entity_2 = self.entity_collection[pair_attrs["e2"]]["text"]
                # First-writer wins: an existing verdict for the pair is
                # never overwritten.
                if entity_1 in self.interaction_collection:
                    if entity_2 not in self.interaction_collection[entity_1]:
                        self.interaction_collection[entity_1][entity_2] = ddi
                elif entity_2 in self.interaction_collection:
                    if entity_1 not in self.interaction_collection[entity_2]:
                        self.interaction_collection[entity_2][entity_1] = ddi
                else:
                    # The original repeated this assignment twice in a row;
                    # once suffices.
                    self.interaction_collection[entity_2] = {entity_1: ddi}
def parse_ddi_corpus(self, file):
    """Parse a DDI-corpus XML file, labelling whole sentences by interaction.

    A sentence containing at least one <pair ddi="true"> is counted as
    positive and its context words update the *true* unigram/bigram/trigram
    hashes; a sentence with only ddi="false" pairs updates the *false*
    hashes; ``self.sentence_interaction_information`` records "true"/"false"
    per sentence id.  Interacting entity pairs are stored with their
    interaction type in ``self.interaction_collection``.

    NOTE(review): this redefines ``parse_ddi_corpus`` -- in Python the later
    definition silently replaces the earlier one of the same name; confirm
    which variant is meant to be active.

    :param file: path to the XML document (name shadows the Py2 builtin
        but is kept unchanged for interface compatibility)
    """
    with open(file, 'r') as document_data:
        xml_data = parse(document_data)
        for sentence in xml_data.getElementsByTagName("sentence"):
            # Per-sentence flags, reset at the top of each sentence.
            has_positive_pair = False
            has_negative_pair = False
            self.total_sentence_count += 1
            sentence_attrs = dict(sentence.attributes.items())
            sentence_text = sentence_attrs["text"]
            sentence_id = sentence_attrs["id"]
            words = utils.tokenize_string_without_punctuations(sentence_text)
            for entity in sentence.getElementsByTagName("entity"):
                entity_attrs = dict(entity.attributes.items())
                self.entity_collection[entity_attrs["id"]] = {
                    "text": entity_attrs["text"].lower(),
                    "type": entity_attrs["type"].lower(),
                }
            # Strip drug names so only context words feed the n-gram counts.
            # NOTE(review): iterates ALL entities seen so far, not just this
            # sentence's; list.remove drops only the first occurrence.
            for ids in self.entity_collection:
                if self.entity_collection[ids]["text"] in words:
                    words.remove(self.entity_collection[ids]["text"])
            for pair in sentence.getElementsByTagName("pair"):
                pair_attrs = dict(pair.attributes.items())
                interaction_type = pair_attrs.get("type", "null")
                # .get avoids the KeyError the original raised on a pair
                # with no "ddi" attribute (the sibling parser guards this);
                # a missing flag now simply matches neither branch.
                ddi_flag = pair_attrs.get("ddi")
                if ddi_flag == "true":
                    has_positive_pair = True
                    entity_1 = self.entity_collection[pair_attrs["e1"]]["text"]
                    entity_2 = self.entity_collection[pair_attrs["e2"]]["text"]
                    # First-writer wins: existing verdicts are not overwritten.
                    if entity_1 in self.interaction_collection:
                        if entity_2 not in self.interaction_collection[entity_1]:
                            self.interaction_collection[entity_1][entity_2] = interaction_type
                    elif entity_2 in self.interaction_collection:
                        if entity_1 not in self.interaction_collection[entity_2]:
                            self.interaction_collection[entity_2][entity_1] = interaction_type
                    else:
                        self.interaction_collection[entity_2] = {entity_1: interaction_type}
                elif ddi_flag == "false" and not has_positive_pair:
                    has_negative_pair = True
            # A single positive pair outranks any number of negative ones.
            if has_positive_pair:
                self.sentence_interaction_information[sentence_id] = "true"
                self.positive_sentence_count += 1
                self.populate_unigram_hashes(words, self.interaction_true_word_count)
                self.update_bigram_hash(words, self.interaction_true_bigram_count)
                self.update_trigram_hash(words, self.interaction_true_trigram_count)
            elif has_negative_pair:
                self.negative_sentence_count += 1
                self.populate_unigram_hashes(words, self.interaction_false_word_count)
                self.update_bigram_hash(words, self.interaction_false_bigram_count)
                self.update_trigram_hash(words, self.interaction_false_trigram_count)
                self.sentence_interaction_information[sentence_id] = "false"
            else:
                # No pairs at all: recorded as non-interacting.
                self.sentence_interaction_information[sentence_id] = "false"