class DrugnameMetamapExtractor(object):
    """Extracts drug name mentions from raw text using MetaMap concepts."""

    def __init__(self, rawTextFileName, intermediateXMLFileName):
        self.preprocess = Preprocessor(rawTextFileName, intermediateXMLFileName)
        self.Text = self.preprocess.rawText()

    def findEntity(self):
        """Run MetaMap on the raw text and return DrugnameElement objects.

        A concept is kept when any of its semantic types is drug-like
        (phsu = pharmacologic substance, orch = organic chemical).
        Duplicate offsets are skipped so each span is reported once.
        """
        # MetaMap server installed on this machine.
        mm = MetaMap.get_instance('/work/tkakar/public_mm/bin/metamap14')
        rawText = self.Text
        concepts, error = mm.extract_concepts([rawText],
                                              word_sense_disambiguation=True)
        offset_list = []
        drugs_list = []
        # (removed unused local `drug_offset_pair` from the original)
        for concept in concepts:
            # semtypes looks like "[phsu,orch]": strip brackets, then split.
            c = concept.semtypes
            c = c.replace("[", "")
            c = c.replace("]", "")
            semTypes = c.strip().split(",")
            for semType in semTypes:
                if semType in ['phsu', 'orch']:
                    # trigger text carries "-"-separated metadata; the first
                    # field (minus a stray "[") is the matched token.
                    token = concept.trigger.strip().split("-")[0]
                    token = token.replace("[", "")
                    offset = self.preprocess.offsetParse(concept.pos_info, ';')
                    for item in offset:
                        # offsetParse yields [start, length]; convert in place
                        # to [start, end].
                        item[1] = item[0] + item[1]
                        if item not in offset_list:
                            offset_list.append(item)
                            drugs_list.append(token)
        drugs_list = [drug.replace('"', "") for drug in drugs_list]
        elementList = []
        for drug, offset in zip(drugs_list, offset_list):
            elementList.append(
                DrugnameElement(drug, [offset], "DrugnameMetamapExtractor",
                                "DRUGNAME"))
        return elementList
def __init__(self, rawTextFileName, intermediateXMLFileName):
    """Run every preprocessing pass up front and remember the XML target."""
    prep = Preprocessor(rawTextFileName, intermediateXMLFileName)
    # Warm the preprocessing products so later stages can read them.
    prep.posTaggedText()
    prep.getParseTree()
    prep.getMetaMapConcepts()
    self.intermediate = intermediateXMLFileName
    self.vectors = []
class DrugnameRegExtractor(object):
    """Extracts drug names by dictionary lookup against a known drug list."""

    def __init__(self, rawTextFileName, intermediateXMLFileName):
        self.preprocess = Preprocessor(rawTextFileName, intermediateXMLFileName)
        self.Tokens = self.preprocess.wordTokenizeText()

    def findEntity(self):
        """Return DrugnameElement objects for tokens found in the drug list.

        Real offsets are unavailable here, so synthetic, non-overlapping
        offsets are generated (unchanged from the original implementation).
        """
        with open("/work/tkakar/git-repos/FDA-Textmining/Drugslist.txt") as myfile:
            drugnames = myfile.read().splitlines()
        # Build the lowered lookup set ONCE. The original rebuilt the lowered
        # list inside the inner loop, making every token an O(len(drugnames))
        # scan; a set gives O(1) membership tests.
        lowered_drugnames = set(item.lower() for item in drugnames)
        Drug_list = []
        for tokens in self.Tokens:
            for token in tokens:
                token = token.lower()
                # tokens have some unicode u character which needs to be removed
                token = token.encode('utf-8')
                if token in lowered_drugnames:
                    Drug_list.append(token)
        elementList = []
        if not Drug_list:
            print("Drugname not found:")
        else:
            for i in range(0, len(Drug_list)):
                # The offset cannot be the same number or empty, so assign
                # synthetic, strictly increasing ranges.
                elementList.append(
                    DrugnameElement(Drug_list[i], [[i * i + 10, i * i + 25]],
                                    "DrugnameRegExtractor", "DRUGNAME"))
        return elementList
def __init__(self, split_percent, dataset: Dataset, seed):
    """Configure identifier-based trace link creation for *dataset*."""
    super(IdentifierDataCreator, self).__init__(split_percent, dataset, seed)
    # Identifiers are camel-case split and stripped of non-letter characters.
    self._code_preprocessor = Preprocessor(
        [CamelCaseSplitter(True), NonLetterFilter()])
    self._code_tokenizer = JavaCodeASTTokenizer()
    self._tracelink_type = TraceLinkType.identifier_tracelinks
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Build an embedding creator for method-name sentences.

    The original signature used Preprocessor(), MockWordEmbeddingCreator()
    and JavaCodeASTTokenizer(None, None) as default values; defaults are
    evaluated once at definition time, so one stateful instance was shared
    by every call (mutable-default-argument pitfall). None sentinels build
    a fresh instance per call instead.
    NOTE(review): callers that explicitly passed None now receive a default
    instance rather than None — confirm no caller relies on that.
    """
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(MethodNameSentenceEmbeddingCreator,
          self).__init__(preprocessor, wordemb_creator, tokenizer,
                         preprocessed_token_output_directory)
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator restricted to methods only.

    The original used shared, stateful objects (Preprocessor() etc.) as
    default argument values — evaluated once at definition time. None
    sentinels build a fresh instance per call instead.
    NOTE(review): callers that explicitly passed None now receive a default
    instance rather than None — confirm no caller relies on that.
    """
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(IdentifierEmbeddingOnlyMethods,
          self).__init__(preprocessor, wordemb_creator, tokenizer,
                         preprocessed_token_output_directory)
    # Exclude class names from the identifier embedding.
    self._with_class_name = False
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator that additionally includes attributes.

    The original used shared, stateful objects (Preprocessor() etc.) as
    default argument values — evaluated once at definition time. None
    sentinels build a fresh instance per call instead.
    NOTE(review): callers that explicitly passed None now receive a default
    instance rather than None — confirm no caller relies on that.
    """
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(IdentifierEmbeddingWithAttribute,
          self).__init__(preprocessor, wordemb_creator, tokenizer,
                         preprocessed_token_output_directory)
    # Include attributes in the identifier embedding.
    self._with_attribute = True
def __init__(self, all_req_files, all_code_files, split_percent,
             dataset: Dataset, seed):
    """Set up req/code preprocessing pipelines for fake-sentence trace links."""
    super(FakeSentenceDataCreator, self).__init__(split_percent, dataset, seed)
    # Requirements: split, camel-case split, drop non-letters, squeeze
    # whitespace, and strip the final punctuation mark.
    self._req_preprocessor = Preprocessor([
        Separator(True),
        CamelCaseSplitter(True),
        NonLetterFilter(),
        DuplicateWhiteSpaceFilter(),
        RemoveLastPunctuatioMark()
    ])
    # Code: same cleanup, but javadoc comments are filtered out first.
    self._code_preprocessor = Preprocessor([
        JavaDocFilter(),
        Separator(True),
        CamelCaseSplitter(True),
        NonLetterFilter(),
        DuplicateWhiteSpaceFilter(),
        RemoveLastPunctuatioMark()
    ])
    self._tracelink_type = TraceLinkType.fake_sentence
def __init__(self, split_percent, dataset: Dataset, seed):
    """Prepare tokenizers, preprocessors and split bookkeeping for *dataset*.

    The actual training/test split containers start empty and are filled in
    later; subclasses must set ``_tracelink_type``.
    """
    self._req_tokenizer = SentenceTokenizer()
    self._req_preprocessor = Preprocessor([
        Separator(True),
        CamelCaseSplitter(True),
        NonLetterFilter(),
        DuplicateWhiteSpaceFilter(),
        AddFullStop()
    ])
    self._code_tokenizer = JavaCodeASTTokenizer(SentenceTokenizer())
    self._code_preprocessor = Preprocessor([
        JavaDocFilter(),
        Separator(True),
        CamelCaseSplitter(True),
        NonLetterFilter(),
        DuplicateWhiteSpaceFilter(),
        AddFullStop()
    ])
    # Req file names (without path and extension) for the training set.
    self._chosen_req_filenames = set()
    # Req file names (without path and extension) for the test set.
    self._remaining_req_filenames = set()
    # Code file names (without path and extension) for the training set.
    self._chosen_code_filenames = set()
    # Code file names (without path and extension) for the test set.
    self._remaining_code_filenames = set()
    # Valid trace links between chosen code and req files (training set).
    self._chosen_trace_matrix = SolutionTraceMatrix()
    # Valid trace links between remaining code and req files (test set).
    self._remaining_trace_matrix = SolutionTraceMatrix()
    # All req files of a project (e.g. etour).
    self._all_req_files = FileUtil.get_files_in_directory(dataset.req_folder())
    # All code files of a project.
    self._all_code_files = FileUtil.get_files_in_directory(dataset.code_folder())
    # Percentage of chosen file data.
    self._split_percent = split_percent
    self._dataset = dataset
    self._seed = seed
    # Complete solution matrix of a project (e.g. etour).
    self._solution_matrix = dataset.solution_matrix()
    # Set this in non-abstract sub class constructors.
    self._tracelink_type = None
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator that maps method comments to the class.

    The original used shared, stateful objects (Preprocessor() etc.) as
    default argument values — evaluated once at definition time. None
    sentinels build a fresh instance per call instead.
    NOTE(review): callers that explicitly passed None now receive a default
    instance rather than None — confirm no caller relies on that.
    """
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(IdentifierEmbeddingCreatorWithMethodCommentToClass,
          self).__init__(preprocessor, wordemb_creator, tokenizer,
                         preprocessed_token_output_directory)
    self._with_class_name = True
    self._with_method = True
    self._with_method_comment_to_class = True
    self._with_class_name_to_method = True
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Base code embedding creator; detects Italian-comment embeddings.

    The original used shared, stateful objects (Preprocessor() etc.) as
    default argument values — evaluated once at definition time. None
    sentinels build a fresh instance per call instead.
    NOTE(review): callers that explicitly passed None now receive a default
    instance rather than None — confirm no caller relies on that.
    """
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(CodeEmbeddingCreator,
          self).__init__(preprocessor, wordemb_creator, tokenizer,
                         preprocessed_token_output_directory)
    self._is_ital_identifier = False
    self._is_ital_comm = False
    # Aligned eng/ital embeddings imply Italian comments in the source.
    if isinstance(wordemb_creator, FastTextAlignedEngItalEmbeddingCreator):
        self._is_ital_comm = True
class NaiveEventDateExtractor(object):
    """Reports the first TIMEX2-tagged span in the text as the event date."""

    def __init__(self, rawTextFileName, intermediateXMLFileName):
        self.preprocess = Preprocessor(rawTextFileName, intermediateXMLFileName)
        self.text = self.preprocess.timexTagText()

    def findEntity(self):
        """Return an EventDateElement for the first <TIMEX2> span, else None."""
        s = re.search(r'(<TIMEX2>)(.*?)(</TIMEX2>)', self.text)
        if s:
            print('Event date: {}'.format(s.group(2)))
            # Source label fixed: was misspelled "NaiveEventDateExtractory",
            # inconsistent with every other extractor passing its class name.
            return EventDateElement(s.group(2), [[s.start(2), s.end(2)]],
                                    "NaiveEventDateExtractor", "EVENT_DT")
def __init__(self,
             precalculated_weights_file,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator weighted by precalculated TF-IDF data.

    The original used shared, stateful objects (Preprocessor() etc.) as
    default argument values — evaluated once at definition time. None
    sentinels build a fresh instance per call instead.
    NOTE(review): callers that explicitly passed None now receive a default
    instance rather than None — confirm no caller relies on that.
    """
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(TFIDFIdentifierEmbeddingCreator,
          self).__init__(preprocessor, wordemb_creator, tokenizer,
                         preprocessed_token_output_directory)
    if not precalculated_weights_file:
        log.info("No precalculated weights file read")
    else:
        self._tf_idf_data = TFIDFData(precalculated_weights_file)
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator using only class names and class comments.

    The original used shared, stateful objects (Preprocessor() etc.) as
    default argument values — evaluated once at definition time. None
    sentinels build a fresh instance per call instead.
    NOTE(review): callers that explicitly passed None now receive a default
    instance rather than None — confirm no caller relies on that.
    """
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(IdentifierEmbeddingOnlyClassNameAndComment,
          self).__init__(preprocessor, wordemb_creator, tokenizer,
                         preprocessed_token_output_directory)
    self._with_class_comment = True
    self._with_method = False
    self._with_class_name_to_method = False
    self._with_attribute = False
    self._with_attribute_comment_to_attr = False
    self._with_attribute_comment_to_class = False
def __init__(self, rawTextFileName, intermediateXMLFileName,
             anExtractorList=None):
    """
    Initializes the EventDateAssembler and returns it. All Extractors for
    the Event Date DataElement must be specified in the list below.

    Args:
        anExtractorList (list): the list passed from the config file for
            EventDate. Defaults to a fresh empty list; a None sentinel is
            used instead of a mutable [] default, which Python evaluates
            once and shares across every instance.

    Returns:
        EventDateAssembler Object
    """
    self.preprocess = Preprocessor(rawTextFileName, intermediateXMLFileName)
    self.AllPossibleExtractorList = {}
    self.extractorList = anExtractorList if anExtractorList is not None else []
    self.extractorObjList = []
    self.dataElementList = []
    self.entityName = ""
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Base identifier embedding creator; flags select which code parts count.

    The original used shared, stateful objects (Preprocessor() etc.) as
    default argument values — evaluated once at definition time. None
    sentinels build a fresh instance per call instead.
    NOTE(review): callers that explicitly passed None now receive a default
    instance rather than None — confirm no caller relies on that.
    """
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    # Feature flags: which parts of a class contribute to the embedding.
    self._with_class_name = True
    self._with_super_classifier = False
    self._with_class_comment = False
    self._with_attribute = False
    self._with_attribute_comment_to_attr = False
    self._with_attribute_comment_to_class = False
    self._with_method = True
    self._with_method_comment_to_method = False
    self._with_method_comment_to_class = False
    self._with_method_body_to_method = False
    self._with_method_body_to_class = False
    self._with_class_name_to_method = True
    self._with_inner_classifier = False
    # function that maps multiple vectors to one
    self._average_function = Util.create_averaged_vector
    super(IdentifierEmbeddingCreator,
          self).__init__(preprocessor, wordemb_creator, tokenizer,
                         preprocessed_token_output_directory)
# END OF IMPORTS #

# Read in data
trainingData = pd.read_csv('train.csv')
testingData = pd.read_csv('test.csv')

# Widen pandas display limits for inspecting data frames.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

test_ID = testingData['Id']

# Preprocessing: feed the raw frames through the Preprocessor and unpack the
# processed result object.
preprocessor = Preprocessor(trainingData, testingData)
dataObject = preprocessor.process(test_ID)
trainingData = dataObject.trainingData
testingData = dataObject.testingData
combinedData = dataObject.combinedData
def main(aRawTextFileName=None, aIntermediateXMLFileName=None, aConfigFile=None):
    """Entry point: wire up preprocessing, assemblers and extractors.

    File names come from the arguments when given; otherwise from sys.argv
    (raw text file, intermediate XML file, config file — in that order).
    """
    assemblerList = []
    if aRawTextFileName is None and aIntermediateXMLFileName is None:
        cmdArgs = sys.argv[1:]
        if len(cmdArgs) >= 3:
            # Raw text file, output XML location and config file are expected
            # as the first, second and third command-line arguments.
            rawTextFileName = cmdArgs[0]
            intermediateXMLFileName = cmdArgs[1]
            configFileName = cmdArgs[2]
        else:
            print("Missing some command-line arguments")
            return
    else:
        rawTextFileName = aRawTextFileName
        intermediateXMLFileName = aIntermediateXMLFileName
        configFileName = aConfigFile
    print('initial preprocess done!')
    preprocessOne = Preprocessor(
        rawTextFileName=rawTextFileName,
        intermediateXMLFileName=intermediateXMLFileName)
    configFile = configFileName
    # One assembler per supported entity; the config file selects among them.
    allAssemblerDict = {
        'Event Date': EventDateAssembler(rawTextFileName, intermediateXMLFileName),
        'Age': AgeAssembler(rawTextFileName, intermediateXMLFileName),
        'Dosage': DosageAssembler(rawTextFileName, intermediateXMLFileName),
        'Drugname': DrugnameAssembler(rawTextFileName, intermediateXMLFileName),
        'Weight': WeightAssembler(rawTextFileName, intermediateXMLFileName),
        'Gender': GenderAssembler(rawTextFileName, intermediateXMLFileName),
        'Reaction': ReactionAssembler(rawTextFileName, intermediateXMLFileName)
    }
    # Place to test new preprocess methods
    preprocessOne.getMetaMapConcepts()

    # Run the extractors requested by the JSON config.
    config = json.load(open(configFile))
    entities = config.keys()
    for entity in entities:
        if entity not in allAssemblerDict:
            raise KeyError("An entity you entered doesn't exist")
        else:
            assemblerList.append((entity, allAssemblerDict[entity]))
    for name, assembler in assemblerList:
        if config[name]:
            assembler.setExtractorList(config[name])
            assembler.runExtractors()
            assembler.writeToSemiFinalXML()
            assembler.launchTestSuite()
def __init__(self, rawTextFileName, intermediateXMLFileName):
    """Word-tokenize the raw text once and keep the Preprocessor handy."""
    prep = Preprocessor(rawTextFileName, intermediateXMLFileName)
    self.preprocess = prep
    self.Tokens = prep.wordTokenizeText()
def __init__(self, rawTextFileName, intermediateXMLFileName):
    """TIMEX2-tag the raw text once and keep the Preprocessor handy."""
    prep = Preprocessor(rawTextFileName, intermediateXMLFileName)
    self.preprocess = prep
    self.text = prep.timexTagText()
def __init__(self, rawTextFileName, intermediateXMLFileName):
    """Load the raw narrative text; the Preprocessor itself is not retained."""
    self.Text = Preprocessor(rawTextFileName, intermediateXMLFileName).rawText()
class AERecognitionEventDateExtractor(object):
    """Finds the event date as the TIMEX2 expression closest (by token index)
    to an 'adverse event' / 'AE' keyword mention.
    """

    def __init__(self, rawTextFileName, intermediateXMLFileName):
        self.preprocess = Preprocessor(rawTextFileName, intermediateXMLFileName)
        self.tokens = self.preprocess.timexTagAndTokenizeText()
        # print self.tokens

    def findEntity(self):
        """Return an EventDateElement for the date nearest an AE keyword,
        or False when no keyword / no temporal expression is present.
        """
        # print self.tokens
        #search for words (e.g. 'AE(s)' or 'Adverse Event(s)')
        pattern = r'\bAE(\s|s)'
        pattern2 = r'\bevents?\b'
        pattern3 = r'<\/?TIMEX2>'
        re_pat = re.compile(pattern, re.IGNORECASE)
        re_pat2 = re.compile(pattern2, re.IGNORECASE)
        re_pat3 = re.compile(pattern3, re.IGNORECASE)
        ae_index_list = []
        #Go through and check for all adverse event/AE keyworks
        # NOTE(review): tokens[index + 1] can raise IndexError when the very
        # last token is 'adverse' — confirm upstream tokenization rules this out.
        for index in range(0, len(self.tokens)):
            if (self.tokens[index].lower() == 'Adverse'.lower() and re_pat2.search(self.tokens[index + 1])) or (re_pat.search(self.tokens[index])):
                ae_index_list.append(index)
        if ae_index_list == []:
            print "There are no instances of keyword 'adverse event/AE'"
            return False
        #Get the indices for all the found tagged words
        time_index_list = [index for index in range(0, len(self.tokens)) if re_pat3.search(self.tokens[index])]
        if time_index_list == []:
            print "There are no temporal expressions in the text."
            return False
        #Minimize difference between indices for AE keywork and dates
        # product() pairs every AE index with every TIMEX index; min picks the
        # closest pair.
        diff = min(product(ae_index_list, time_index_list), key=lambda t: abs(t[0] - t[1]))
        # Decide which element of the pair is the TIMEX tag token.
        if re_pat3.search(self.tokens[diff[0]]):
            timexTuple = (diff[0], self.tokens[diff[0]])
            aeTuple = (diff[1], self.tokens[diff[1]])
        else:
            timexTuple = (diff[1], self.tokens[diff[1]])
            aeTuple = (diff[0], self.tokens[diff[0]])
        #Figure out of timex tuple is <TIMEX2> or </TIMEX2> and act accordingly
        if timexTuple[1].lower() == '<TIMEX2>'.lower():
            # Opening tag: collect tokens forward until the closing tag.
            date = list(itertools.takewhile(lambda x: x.lower() != '</TIMEX2>'.lower(), self.tokens[timexTuple[0] + 1:]))
        else:
            # Closing tag: collect tokens backward until the opening tag,
            # then restore original order.
            reversedList = self.tokens[::-1]
            date = list(itertools.takewhile(lambda x: x.lower() != '<TIMEX2>'.lower(), reversedList[(len(self.tokens) - timexTuple[0]):]))
            date = date[::-1]
        count = 0
        #the rest is to find the offset
        # Count how many TIMEX2 tag tokens precede the date, since tags do not
        # exist in the untagged document used for globalIDs.
        for idx, token in enumerate(self.tokens):
            if idx > timexTuple[0]:
                break
            if token.lower() == '<TIMEX2>'.lower() or token.lower() == '</TIMEX2>'.lower():
                count += 1
        # add one because tokens index starts at 0
        loc = timexTuple[0] + 1 - count
        print "this is the timexTuple: ", timexTuple[0]
        print "this is the loc: ", loc
        root = self.preprocess.getRoot()
        offsets = []
        # Look up each date token's character offset by its globalID attribute.
        for x in range(0, len(date)):
            elem = root.find(".//*[@globalID='" + str(loc + x) + r"']")
            if elem is not None:
                offsets.append(elem.attrib['offset'])
        print 'AERecognitionEventDateExtractor: ', " ".join(date)
        offsetList = self.preprocess.offsetParse(";".join(offsets), delimiter=";")
        print " ".join(date), offsetList
        # Fall back to an empty offset list when no globalID lookups succeeded.
        if not offsetList:
            return EventDateElement(" ".join(date), [[]], "AERecognitionEventDateExtractor", 'EVENT_DT')
        else:
            return EventDateElement(" ".join(date), offsetList, "AERecognitionEventDateExtractor", 'EVENT_DT')