コード例 #1
0
ファイル: cli.py プロジェクト: BeelineShopping/BeelineDB
    def matchUp(self, token, ingredientRow):
        """
        Returns our best guess of the match between the tags and the
        words from the display text.

        This problem is difficult for the following reasons:
            * not all the words in the display name have associated tags
            * the quantity field is stored as a number, but it appears
              as a string in the display name
            * the comment is often a compilation of different comments in
              the display name

        Args:
            token: a single token taken from the tokenized display text.
            ingredientRow: row of field name -> value pairs; it must
                provide ``iteritems()`` (this is Python 2 code).

        Returns:
            A list of upper-cased field names. A name is appended once for
            every word of a string field that equals the normalized token,
            and once when a non-string field equals the value that
            ``self.parseNumbers`` produced for the token. The list is empty
            when nothing matches.
        """
        ret = []

        # strip parens from the token, since they often appear in the
        # display_name, but are removed from the comment.
        token = utils.normalizeToken(token)
        decimalToken = self.parseNumbers(token)

        for key, val in ingredientRow.iteritems():
            if isinstance(val, basestring):
                # Text field: compare word by word. A word that occurs
                # several times contributes the tag several times.
                for vt in utils.tokenize(val):
                    if utils.normalizeToken(vt) == token:
                        ret.append(key.upper())

            elif decimalToken is not None:
                # Non-text field: compare against the parsed number. The
                # comparison stays best-effort (odd value types must not
                # abort the run), but only ordinary errors are swallowed so
                # KeyboardInterrupt/SystemExit still propagate.
                try:
                    isMatch = bool(val == decimalToken)
                except Exception:
                    isMatch = False

                if isMatch:
                    ret.append(key.upper())

        return ret
コード例 #2
0
ファイル: cli.py プロジェクト: BeelineShopping/BeelineDB
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        df = pd.read_csv(self.opts.data_path)
        df = df.fillna("")

        start = int(offset)
        end = int(offset) + int(count)

        df_slice = df.iloc[start: end]

        for index, row in df_slice.iterrows():
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del(row["input"])

                rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])

                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i+1, tokens)
                    print utils.joinLine([token] + features + [self.bestTag(tags)])

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            print
コード例 #3
0
    def matchUp(self, token, ingredientRow):
        """
        Returns our best guess of the match between the tags and the
        words from the display text.

        This problem is difficult for the following reasons:
            * not all the words in the display name have associated tags
            * the quantity field is stored as a number, but it appears
              as a string in the display name
            * the comment is often a compilation of different comments in
              the display name

        Args:
            token: a single token taken from the tokenized display text.
            ingredientRow: row of field name -> value pairs; it must
                provide ``iteritems()`` (this is Python 2 code).

        Returns:
            A list of upper-cased field names. A name is appended once for
            every word of a string field that equals the normalized token,
            and once when a non-string field equals the value that
            ``self.parseNumbers`` produced for the token. The list is empty
            when nothing matches.
        """
        ret = []

        # strip parens from the token, since they often appear in the
        # display_name, but are removed from the comment.
        token = utils.normalizeToken(token)
        decimalToken = self.parseNumbers(token)

        for key, val in ingredientRow.iteritems():
            if isinstance(val, basestring):
                # Text field: compare word by word. A word that occurs
                # several times contributes the tag several times.
                for vt in utils.tokenize(val):
                    if utils.normalizeToken(vt) == token:
                        ret.append(key.upper())

            elif decimalToken is not None:
                # Non-text field: compare against the parsed number. The
                # comparison stays best-effort (odd value types must not
                # abort the run), but only ordinary errors are swallowed so
                # KeyboardInterrupt/SystemExit still propagate.
                try:
                    isMatch = bool(val == decimalToken)
                except Exception:
                    isMatch = False

                if isMatch:
                    ret.append(key.upper())

        return ret
コード例 #4
0
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        df = pd.read_csv(self.opts.data_path)
        df = df.fillna("")

        start = int(offset)
        end = int(offset) + int(count)

        df_slice = df.iloc[start:end]

        for index, row in df_slice.iterrows():
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del (row["input"])

                rowData = self.addPrefixes([(t, self.matchUp(t, row))
                                            for t in tokens])

                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i + 1, tokens)
                    print utils.joinLine([token] + features +
                                         [self.bestTag(tags)])

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            print
コード例 #5
0
#!/usr/bin/env python

import sys
import os
import re

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from training import utils

# Script body: turn every line of FILENAME into token/feature rows, write
# them to a temporary file next to the input, run crf_test on that file with
# the model under ../../tmp/model_file, then delete the temporary file.

# Imported here so this section is self-contained; used to launch crf_test
# with an argument list instead of a shell command string.
import subprocess

if len(sys.argv) < 2:
    sys.stderr.write('Usage: parse-ingredients.py FILENAME\n')
    sys.exit(1)

FILENAME = str(sys.argv[1])
tmpFile = FILENAME + ".tmp"

with open(FILENAME) as infile, open(tmpFile, 'w') as outfile:
    for line in infile:
        # Remove anything of the form <...> before tokenizing.
        line_clean = re.sub(r'<[^<]+?>', '', line)
        tokens = utils.tokenize(line_clean)

        for i, token in enumerate(tokens):
            # Positions handed to getFeatures are 1-based.
            features = utils.getFeatures(token, i + 1, tokens)
            outfile.write(utils.joinLine([token] + features) + "\n")
        # A blank line separates the rows of one input line from the next.
        outfile.write("\n")

tmpFilePath = "../../tmp/model_file"
modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)

try:
    # Argument list (no shell): file names containing spaces or shell
    # metacharacters are passed to crf_test verbatim. Its exit status is
    # ignored, as before.
    subprocess.call(["crf_test", "-v", "1", "-m", modelFilename, tmpFile])
except OSError as err:
    # Raised when crf_test cannot be started (e.g. it is not installed).
    sys.stderr.write("Could not run crf_test: %s\n" % err)
    sys.exit(1)
finally:
    # Always clean up the temporary file, even when crf_test failed to start.
    os.remove(tmpFile)
コード例 #6
0
#!/usr/bin/env python

import sys
import os
import re

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from training import utils

# Script body: turn every line of FILENAME into token/feature rows, write
# them to a temporary file next to the input, run crf_test on that file with
# the model under ../../tmp/model_file, then delete the temporary file.

# Imported here so this section is self-contained; used to launch crf_test
# with an argument list instead of a shell command string.
import subprocess

if len(sys.argv) < 2:
    sys.stderr.write('Usage: parse-ingredients.py FILENAME\n')
    sys.exit(1)

FILENAME = str(sys.argv[1])
tmpFile = FILENAME + ".tmp"

with open(FILENAME) as infile, open(tmpFile, 'w') as outfile:
    for line in infile:
        # Remove anything of the form <...> before tokenizing.
        line_clean = re.sub(r'<[^<]+?>', '', line)
        tokens = utils.tokenize(line_clean)

        for i, token in enumerate(tokens):
            # Positions handed to getFeatures are 1-based.
            features = utils.getFeatures(token, i+1, tokens)
            outfile.write(utils.joinLine([token] + features) + "\n")
        # A blank line separates the rows of one input line from the next.
        outfile.write("\n")

tmpFilePath = "../../tmp/model_file"
modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)

try:
    # Argument list (no shell): file names containing spaces or shell
    # metacharacters are passed to crf_test verbatim. Its exit status is
    # ignored, as before.
    subprocess.call(["crf_test", "-v", "1", "-m", modelFilename, tmpFile])
except OSError as err:
    # Raised when crf_test cannot be started (e.g. it is not installed).
    sys.stderr.write("Could not run crf_test: %s\n" % err)
    sys.exit(1)
finally:
    # Always clean up the temporary file, even when crf_test failed to start.
    os.remove(tmpFile)