def generate_data(self, count, offset):
    """
    Generates training data in the CRF++ format for the ingredient
    tagging task.

    Reads the CSV at ``self.opts.data_path`` (missing values become ""),
    takes the ``count`` rows starting at position ``offset`` and, for each
    token of a row's "input" column, prints one line built by
    ``utils.joinLine`` from the token, its ``utils.getFeatures`` features
    and the tag chosen by ``self.bestTag``. A blank line is printed after
    every row, including skipped ones.

    count  -- number of rows to process (anything ``int()`` accepts)
    offset -- position of the first row to process (anything ``int()``
              accepts)

    A row that raises UnicodeDecodeError is reported on stderr and skipped
    instead of being dropped silently.
    """
    import sys  # local import: only needed for the skipped-row warning

    df = pd.read_csv(self.opts.data_path)
    df = df.fillna("")

    start = int(offset)
    end = start + int(count)

    for index, row in df.iloc[start:end].iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            # remove the phrase itself so matchUp only receives the
            # remaining columns of the row
            del row["input"]

            rowData = self.addPrefixes(
                [(t, self.matchUp(t, row)) for t in tokens]
            )

            # Build every line of the row before printing any of them, so
            # a failure while building cannot leave a truncated token
            # sequence in the output.
            lines = [
                utils.joinLine(
                    [token]
                    + utils.getFeatures(token, i + 1, tokens)
                    + [self.bestTag(tags)]
                )
                for i, (token, tags) in enumerate(rowData)
            ]

            for line in lines:
                # single-argument print() behaves the same on Python 2 and 3
                print(line)

        except UnicodeDecodeError as err:
            # Best effort: keep going, but make the skipped row visible.
            # stderr keeps the warning out of the generated data on stdout.
            sys.stderr.write("Skipping row %s: %s\n" % (index, err))

        # blank line closing this row's block of token lines
        print("")
def generate_data(self, count, offset):
    """
    Generates training data in the CRF++ format for the ingredient
    tagging task.

    Reads the CSV at ``self.opts.data_path`` (missing values become ""),
    takes the ``count`` rows starting at position ``offset`` and, for each
    token of a row's "input" column, prints one line built by
    ``utils.joinLine`` from the token, its ``utils.getFeatures`` features
    and the tag chosen by ``self.bestTag``. A blank line is printed after
    every row, including skipped ones.

    count  -- number of rows to process (anything ``int()`` accepts)
    offset -- position of the first row to process (anything ``int()``
              accepts)

    A row that raises UnicodeDecodeError is reported on stderr and skipped
    instead of being dropped silently.
    """
    import sys  # local import: only needed for the skipped-row warning

    df = pd.read_csv(self.opts.data_path)
    df = df.fillna("")

    start = int(offset)
    end = start + int(count)

    for index, row in df.iloc[start:end].iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            # remove the phrase itself so matchUp only receives the
            # remaining columns of the row
            del row["input"]

            rowData = self.addPrefixes(
                [(t, self.matchUp(t, row)) for t in tokens]
            )

            # Build every line of the row before printing any of them, so
            # a failure while building cannot leave a truncated token
            # sequence in the output.
            lines = [
                utils.joinLine(
                    [token]
                    + utils.getFeatures(token, i + 1, tokens)
                    + [self.bestTag(tags)]
                )
                for i, (token, tags) in enumerate(rowData)
            ]

            for line in lines:
                # single-argument print() behaves the same on Python 2 and 3
                print(line)

        except UnicodeDecodeError as err:
            # Best effort: keep going, but make the skipped row visible.
            # stderr keeps the warning out of the generated data on stdout.
            sys.stderr.write("Skipping row %s: %s\n" % (index, err))

        # blank line closing this row's block of token lines
        print("")
#!/usr/bin/env python
# Usage: parse-ingredients.py FILENAME
#
# Strips markup tags from every line of FILENAME, writes one feature line
# per token to FILENAME.tmp (a blank line after each input line), runs
# crf_test on that file with the model at ../../tmp/model_file (relative to
# this script) and finally deletes the temporary file.
import sys
import os
import re
import subprocess

# make the sibling "training" package importable when run as a script
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from training import utils

if len(sys.argv) < 2:
    sys.stderr.write('Usage: parse-ingredients.py FILENAME')
    sys.exit(1)

FILENAME = str(sys.argv[1])
tmpFile = FILENAME + ".tmp"

tmpFilePath = "../../tmp/model_file"
modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)

# compiled once instead of on every input line
tagPattern = re.compile('<[^<]+?>')

try:
    with open(FILENAME) as infile, open(tmpFile, 'w') as outfile:
        for line in infile:
            line_clean = tagPattern.sub('', line)
            tokens = utils.tokenize(line_clean)
            for i, token in enumerate(tokens):
                features = utils.getFeatures(token, i + 1, tokens)
                outfile.write(utils.joinLine([token] + features) + "\n")
            # blank line closing this input line's block of token lines
            outfile.write("\n")

    try:
        # Argument list, no shell: paths containing spaces or shell
        # metacharacters are passed to crf_test verbatim.
        subprocess.call(["crf_test", "-v", "1", "-m", modelFilename, tmpFile])
    except OSError as err:
        # raised when the crf_test executable cannot be started
        sys.stderr.write("Could not run crf_test: %s\n" % err)
        sys.exit(1)
finally:
    # always clean up, even if tokenizing or crf_test failed
    if os.path.exists(tmpFile):
        os.remove(tmpFile)
#!/usr/bin/env python
# Usage: parse-ingredients.py FILENAME
#
# Strips markup tags from every line of FILENAME, writes one feature line
# per token to FILENAME.tmp (a blank line after each input line), runs
# crf_test on that file with the model at ../../tmp/model_file (relative to
# this script) and finally deletes the temporary file.
import sys
import os
import re
import subprocess

# make the sibling "training" package importable when run as a script
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from training import utils

if len(sys.argv) < 2:
    sys.stderr.write('Usage: parse-ingredients.py FILENAME')
    sys.exit(1)

FILENAME = str(sys.argv[1])
tmpFile = FILENAME + ".tmp"

tmpFilePath = "../../tmp/model_file"
modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)

# compiled once instead of on every input line
tagPattern = re.compile('<[^<]+?>')

try:
    with open(FILENAME) as infile, open(tmpFile, 'w') as outfile:
        for line in infile:
            line_clean = tagPattern.sub('', line)
            tokens = utils.tokenize(line_clean)
            for i, token in enumerate(tokens):
                features = utils.getFeatures(token, i + 1, tokens)
                outfile.write(utils.joinLine([token] + features) + "\n")
            # blank line closing this input line's block of token lines
            outfile.write("\n")

    try:
        # Argument list, no shell: paths containing spaces or shell
        # metacharacters are passed to crf_test verbatim.
        subprocess.call(["crf_test", "-v", "1", "-m", modelFilename, tmpFile])
    except OSError as err:
        # raised when the crf_test executable cannot be started
        sys.stderr.write("Could not run crf_test: %s\n" % err)
        sys.exit(1)
finally:
    # always clean up, even if tokenizing or crf_test failed
    if os.path.exists(tmpFile):
        os.remove(tmpFile)