Example 1
    def add_line_break(self, text):

        # use an in-memory StringIO buffer (from the io module) instead of
        # repeated string concatenation, which is much slower in Python
        new_text = StringIO()

        # generate features for each word to predict whether a line break follows it;
        # word_features splits the text on whitespace and builds one feature row per word
        features = word_features(text, gold_standard=False).values
        for row in features:
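            # CRF++'s Tagger.add() expects one whitespace-separated feature line per token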
            feature_string = " ".join([str(i) for i in row])
            self.tagger.add(feature_string)

        # generate prediction
        self.tagger.parse()

        # rebuild the text according to the predicted labels
        for idx in range(len(features)):
            new_text.write(features[idx][0])
            # if the model predicts a line break ("NL") after this word,
            # write a blank line; otherwise separate words with a space
            if self.tagger.y2(idx) == "NL":
                new_text.write("\n\n")
            else:
                new_text.write(" ")

        # clear parsed words
        self.tagger.clear()

        # return the accumulated buffer contents as a single string
        return new_text.getvalue()
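A minimal usage sketch, assuming this method lives on a wrapper class that owns a CRF++ Tagger loaded from a previously trained model (the class name LineBreaker and the model path below are illustrative, not part of the original example):

from io import StringIO
import CRFPP  # CRF++ Python binding

class LineBreaker:
    def __init__(self, model_path):
        # load the trained CRF++ model once; the tagger is reused across calls
        self.tagger = CRFPP.Tagger("-m " + model_path)

    # add_line_break(self, text) would be defined here, exactly as above

# breaker = LineBreaker("crf_model")
# print(breaker.add_line_break("raw text whose original line breaks were stripped"))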
Example 2
import os
import time
import codecs

import pandas as pd

# word_features and xml_strip are project-local helpers defined elsewhere in the repository

def prepare_crf_data(dir, output_file, file_names=[]):
    '''
    Parse the .xml and .txt files in dir into the tab-separated format expected
    by the CRF++ tool and export all of the converted data to output_file.
    '''

    # Accumulate the converted feature rows used for CRF training
    total_features = pd.DataFrame()

    # record the time when data preparation starts
    begin_train = time.time()

    print("Data preparation started:\n")

    if len(file_names) > 0:
        files = file_names
    else:
        files = os.listdir(dir)

    for file in files:
        with codecs.open(os.path.join(dir, file), "r", "ISO-8859-1") as f:
            string = f.read()
            if file.endswith(".xml"):
                text = xml_strip(string)
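                # sanity check: stripping XML markup should shorten the text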
                assert len(text) < len(string)
            elif file.endswith(".txt"):
                text = string
            else: continue

            total_features = pd.concat([total_features, word_features(text)], axis=0)

    # print total data-preparation time
    end_train = time.time() - begin_train
    print("Done preparing training data in", end_train, "seconds")

    # export training data
    total_features.to_csv(output_file, encoding='utf-8', sep="\t", header=False, index=False)
    print "Training features saved"