Example #1
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        df = pd.read_csv(self.opts.data_path)
        df = df.fillna("")

        # take a slice of `count` labelled rows starting at `offset`
        start = int(offset)
        end = int(offset) + int(count)

        df_slice = df.iloc[start:end]

        for index, row in df_slice.iterrows():
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del row["input"]

                # match each token against the row's labelled fields and prefix the tags
                rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])

                # one line per token: the token, its feature columns, then the best tag
                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i + 1, tokens)
                    print(utils.joinLine([token] + features + [self.bestTag(tags)]))

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            # a blank line separates token sequences in the CRF++ format
            print()
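For reference, CRF++ training data is one token per line, with the token, its feature columns, and the label as the last column, whitespace-separated, and a blank line between token sequences. Below is a minimal sketch of that layout; the tokens, feature names, and tags are invented for illustration, since the real values come from utils.getFeatures and bestTag above.

# Minimal sketch of the CRF++ training layout emitted by generate_data.
# Tokens, features, and tags below are invented for illustration only.
rows = [
    ("1",     ["I1", "L8", "NoCAP", "NoPAREN"], "B-QTY"),
    ("cup",   ["I2", "L8", "NoCAP", "NoPAREN"], "B-UNIT"),
    ("sugar", ["I3", "L8", "NoCAP", "NoPAREN"], "B-NAME"),
]

for token, features, tag in rows:
    # one tab-separated line per token: token, feature columns, label last
    print("\t".join([token] + features + [tag]))
print()  # blank line terminates the sequence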
#!/usr/bin/env python

import sys
import os
import re

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from training import utils

if len(sys.argv) < 2:
    sys.stderr.write('Usage: parse-ingredients.py FILENAME\n')
    sys.exit(1)

FILENAME = str(sys.argv[1])
tmpFile = FILENAME + ".tmp"

# write one CRF++ test line per token (no label column), with a blank
# line between ingredient phrases
with open(FILENAME) as infile, open(tmpFile, 'w') as outfile:
    for line in infile:
        # strip any HTML tags before tokenizing
        line_clean = re.sub('<[^<]+?>', '', line)
        tokens = utils.tokenize(line_clean)

        for i, token in enumerate(tokens):
            features = utils.getFeatures(token, i + 1, tokens)
            outfile.write(utils.joinLine([token] + features) + "\n")
        outfile.write("\n")

# run the trained model over the temp file, then clean up
tmpFilePath = "../../tmp/model_file"
modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)
os.system("crf_test -v 1 -m %s %s" % (modelFilename, tmpFile))
os.system("rm %s" % tmpFile)
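The two os.system calls at the end could also be written with subprocess and os.remove, which avoids a shell and keeps the temp-file cleanup portable. A sketch, assuming Python 3 and that the crf_test binary is on PATH, reusing modelFilename and tmpFile from the script above:

import subprocess

# same crf_test flags as above, run without a shell,
# followed by a portable removal of the temporary file
subprocess.run(["crf_test", "-v", "1", "-m", modelFilename, tmpFile], check=True)
os.remove(tmpFile)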