def convert_file(self, input_file, output_file):
    """Copy the word tokens found in a tagged input stream to ``output_file``.

    The input is scanned line by line for a few tag substrings:

    * ``</para>`` followed later by ``<para>`` writes two newline
      characters, separating the paragraphs by a blank line;
    * ``<w `` opens a word and ``</w>`` closes it, at which point the text
      collected for that word is written out;
    * ``<token>`` appends the token text plus one trailing space to the
      pending word;
    * ``<no_space_after>`` with value ``"1"`` removes the last character
      of the pending word (the trailing space added after a token).

    Token and flag values are extracted with ``get_interstring(line, '>', '<')``
    (presumably the text between the first '>' and the following '<' --
    confirm against that helper).

    input_file  ... iterable of lines
    output_file ... object with a ``write`` method
    Returns None; the only effect is the writes to ``output_file``.
    """
    pending_word = ""
    para_closed = False   # a </para> was seen and no <para> has followed yet
    inside_word = False   # a <w ...> was seen and its </w> has not followed yet

    for line in input_file:
        if "</para>" in line:
            para_closed = True
        elif "<para>" in line and para_closed:
            # Only paragraphs that follow a closed one get a separator,
            # so nothing is written before the first paragraph.
            output_file.write('\n\n')
            para_closed = False
        elif "<w " in line:
            inside_word = True
        elif "</w>" in line and inside_word:
            output_file.write(pending_word)
            pending_word = ""
            inside_word = False
        elif "<token>" in line:
            pending_word += get_interstring(line, '>', '<') + " "
        elif "<no_space_after>" in line:
            if get_interstring(line, '>', '<') == "1":
                pending_word = pending_word[:-1]
def process_coref(self, first_line, actual_info, perspron):  # -> Coreference_record
    """
    reads the coreferent ID and creates a new coreference record

    first_line  ... line that opened the coreference element; checked for
                    "<coref_gram" or "<coref_text"
    actual_info ... pair (id, type) of the node the coreference belongs to
    perspron    ... passed through unchanged to Coreference_record

    returns a Coreference_record, or None when either the node id or the
    coreferent id is None
    """
    actual_ID, actual_type = actual_info

    coref_ID = ""
    if "<coref_gram" in first_line:
        # the ID is on the line right after the opening tag
        coref_ID = get_interstring(self.pdt_t_input.readline(), '>', '<')
    elif "<coref_text" in first_line:
        self.pdt_t_input.readline()  # skip one line (marked <LM> in the original)
        coref_ID = get_interstring(self.pdt_t_input.readline(), '>', '<')
    # "<coref_special" (segments or exophorae) does not refer to any word in
    # the file, so coref_ID stays "" for it.

    coref_type = self.ID_type(coref_ID)  # string "word", "dropped", "sentence", "other"
    if coref_type == "dropped":
        # replace a reference to a dropped pronoun with a reference to its
        # non-dropped supernode
        coref_ID = self.get_dropped(coref_ID)

    # Identity checks against None (PEP 8) instead of "!= None".
    if actual_ID is None or coref_ID is None:
        return None
    return Coreference_record(
        actual_type == "dropped",
        actual_ID,
        coref_type == "dropped",
        coref_ID,
        perspron,
    )
def read_infos(self, first_line):  # -> (string, Node_type)
    """
    obtains information about a node from its first line

    first_line ... first line of the node; the id is taken from between
                   the '"' delimiters via get_interstring

    returns a pair: the id string and the node type that
    self.get_node_type reports for that id
    """
    node_id = get_interstring(first_line, '"', '"')
    return (node_id, self.get_node_type(node_id))
def get_paragraph_ID(self, id_string):  # -> int
    """
    gets the number of the actual paragraph from the id of the actual node

    id_string ... id of a sentence, e.g. t-lnd94103-052-p1s11

    Only the part after the last '-' is inspected; the text that
    get_interstring extracts from it with the delimiters 'p' and 's' is
    converted to int.
    """
    tail = id_string.rpartition('-')[2]
    return int(get_interstring(tail, 'p', 's'))
def next_pdt_word(self):
    """Read ahead in ``self.pdt_w_input`` to the next word element.

    Lines are consumed one at a time:

    * a line containing ``<para`` increments ``self.para_ID`` and resets
      ``self.sent_ID`` to 1;
    * a line containing ``<w id`` yields the word: its id is extracted from
      that line (between '"' delimiters) and its token from the line that
      follows (between '>' and '<'), both via ``get_interstring``.

    Returns the pair ``(id, token)``, or ``("", "")`` when a line containing
    ``</doc>`` or the end of the input (empty read) is reached first.
    """
    while True:
        current = self.pdt_w_input.readline()

        # readline() returns "" at end of input; "</doc>" ends the document.
        if current == "" or "</doc>" in current:
            return ("", "")

        if "<para" in current:
            self.para_ID += 1
            self.sent_ID = 1
        elif "<w id" in current:
            word_ID = get_interstring(current, '"', '"')
            following = self.pdt_w_input.readline()
            return (word_ID, get_interstring(following, '>', '<'))
def process_coref(self, first_line, actual_info):  # -> Coreference_record
    """
    reads the coreferent ID and creates a new coreference record

    first_line  ... line that opened the coreference element; checked for
                    "<coref_gram" or "<coref_text"
    actual_info ... pair (id string, Node_type) of the node the
                    coreference belongs to

    returns a Coreference_record, or None when either the node id or the
    coreferent id is None
    """
    actual_ID, actual_type = actual_info  # id_string and Node_type (word or dropped)

    coref_ID = ""
    if "<coref_gram" in first_line:
        # the ID is on the line right after the opening tag
        coref_ID = get_interstring(self.pdt_t_input.readline(), '>', '<')
    elif "<coref_text" in first_line:
        self.pdt_t_input.readline()  # skip one line (marked <LM> in the original)
        coref_ID = get_interstring(self.pdt_t_input.readline(), '>', '<')
    # "<coref_special" (segments or exophorae) does not refer to any word in
    # the file, so coref_ID stays "" for it.

    coref_type = self.get_node_type(coref_ID)
    if coref_type == Node_type.Dropped:
        # replace a reference to a dropped pronoun with a reference to its
        # non-dropped supernode
        coref_ID = self.get_dropped(coref_ID)

    # Identity checks against None (PEP 8) instead of "!= None"; the
    # no-record case returns None explicitly rather than by falling through.
    if actual_ID is None or coref_ID is None:
        return None
    return Coreference_record(
        actual_type == Node_type.Dropped,
        actual_ID,
        coref_type == Node_type.Dropped,
        coref_ID,
    )
def paragraph_ID(self, string):
    """
    gets a paragraph number out of an id-like string

    string ... '-'-separated id; only the part after the last '-' is used

    The text that get_interstring extracts from that part with the
    delimiters 'p' and 's' is returned as int.
    """
    final_segment = string.rsplit('-', 1)[-1]
    return int(get_interstring(final_segment, 'p', 's'))
def read_infos(self, string):
    """
    obtains the id of a node and its type

    string ... line holding the id between '"' delimiters (extracted via
               get_interstring)

    returns a pair: the id string and whatever self.ID_type reports for it
    """
    node_id = get_interstring(string, '"', '"')
    return (node_id, self.ID_type(node_id))