def add_line_break(self, text):
    """Insert predicted line breaks into *text*.

    Builds per-word features, feeds them to the CRF tagger, and rebuilds
    the text with a blank line ("\n\n") after every word the tagger
    labels "NL" and a single space after every other word.

    text -- raw input string to segment
    Returns the re-assembled string.
    """
    # Writing to a pseudo-file (StringIO) is much faster than repeated
    # string concatenation while interleaving per-word writes.
    new_text = StringIO()
    # Features for each word, used to predict whether a line break
    # follows that word (gold_standard=False: input is unlabeled).
    features = word_features(text, gold_standard=False).values
    for row in features:
        self.tagger.add(" ".join(str(value) for value in row))
    # Run the CRF prediction over all queued feature rows.
    self.tagger.parse()
    # Rebuild the text according to the per-word predictions.
    for idx, row in enumerate(features):
        new_text.write(row[0])  # row[0] holds the word itself
        # "NL" label => paragraph break after this word, else a space.
        if self.tagger.y2(idx) == "NL":
            new_text.write("\n\n")
        else:
            new_text.write(" ")
    # Release the parsed words so the tagger can be reused.
    self.tagger.clear()
    return new_text.getvalue()
def prepare_crf_data(dir, output_file, file_names=None):
    '''
    Parse data in .xml and .txt files into data that can be recognized
    by the crf++ tool and export it all to output_file.

    dir         -- directory containing the input files
    output_file -- path of the tab-separated feature file to write
    file_names  -- optional explicit list of file names to process;
                   when omitted, every file in dir is processed
    '''
    # Accumulates the transferred data used for crf training.
    total_features = DataFrame()
    # Store the begin train time.
    begin_train = time.time()
    print("Data preparation begin:\n")
    # None sentinel instead of a mutable default argument ([]), which
    # would be shared across calls.
    if file_names:
        files = file_names
    else:
        files = os.listdir(dir)
    for file in files:
        # os.path.join works whether or not dir ends with a separator
        # (plain "dir + file" silently broke without a trailing slash).
        with codecs.open(os.path.join(dir, file), "r", "ISO-8859-1") as f:
            string = f.read()
        if file.endswith(".xml"):
            text = xml_strip(string)
            # Stripping the markup must shorten the document.
            assert len(text) < len(string)
        elif file.endswith(".txt"):
            text = string
        else:
            # Skip files of any other type.
            continue
        total_features = pd.concat([total_features, word_features(text)], axis=0)
    # Print total training-data preparation time.
    end_train = time.time() - begin_train
    print("Done prepping training data in ", end_train, "seconds")
    # Export training data as tab-separated values, no header/index,
    # the layout crf++ expects.
    total_features.to_csv(output_file, encoding='utf-8', sep="\t",
                          header=False, index=False)
    # Was a Python 2 print statement; a call is consistent with the
    # other prints above and valid on Python 3.
    print("Training features saved")