def translate_row(row):
    """Translates a row of labeled data into CRF++-compatible tag strings.

    Args:
        row: A row of data from the input CSV of labeled ingredient data.

    Returns:
        The row of input converted to CRF++-compatible tags, e.g.

            2\tI1\tL4\tNoCAP\tNoPAREN\tB-QTY
            cups\tI2\tL4\tNoCAP\tNoPAREN\tB-UNIT
            flour\tI3\tL4\tNoCAP\tNoPAREN\tB-NAME
    """
    # extract the display name
    display_input = utils.cleanUnicodeFractions(row['input'])
    tokens = tokenizer.tokenize(display_input)
    labels = _row_to_labels(row)
    label_data = _addPrefixes([(t, _matchUp(t, labels)) for t in tokens])

    # Build every output line first and join once: avoids the quadratic
    # cost of repeated `translated += ...` while keeping the trailing
    # newline on each line (including the last), as before.
    lines = []
    for i, (token, tags) in enumerate(label_data):
        # getFeatures expects a 1-based token position.
        features = utils.getFeatures(token, i + 1, tokens)
        lines.append(utils.joinLine([token] + features + [_bestTag(tags)]) + '\n')
    return ''.join(lines)
def generate_data(self, count, offset):
    """Generates training data in the CRF++ format for the ingredient
    tagging task, then writes it to a temp file and prints its path.

    Args:
        count: number of rows from the snapshot CSV to process.
        offset: index of the first row to process.
    """
    # NOTE(review): the CSV path is hard-coded here while sibling
    # variants read it from self.opts.data_path — consider unifying.
    df = pd.read_csv("nyt-ingredients-snapshot-2015.csv")
    df = df.fillna("")

    start = int(offset)
    end = int(offset) + int(count)
    df_slice = df.iloc[start:end]

    # Collect lines and join once instead of `s = s + ...` per line,
    # which is quadratic in the number of output lines.
    lines = []
    for index, row in df_slice.iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            del row["input"]

            rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])
            for i, (token, tags) in enumerate(rowData):
                features = utils.getFeatures(token, i + 1, tokens)
                lines.append(utils.joinLine([token] + features + [self.bestTag(tags)]) + '\n')
        # ToDo: deal with this
        except UnicodeDecodeError:
            pass

    s = "".join(lines)
    # print(x) with a single argument is valid in both Python 2 and 3.
    print(self.writeTempFile(s))
def predictIngredientTag(ingredient):
    """Tag every token of *ingredient* with the trained sequence model.

    Prints each (token, tag) prediction as a side effect and returns the
    same pairs as a list of tuples. Relies on the module-level globals
    `words`, `arr` and `tags` produced during training.
    """
    cleaned = utils.cleanUnicodeFractions(ingredient)
    token_list = utils.tokenizeWithoutPunctuation(cleaned)

    # Vocabulary index over the global word list; unseen input words map to 0.
    vocab = {word: idx for idx, word in enumerate(words)}

    # Re-encode the training sentences only to recover the padding length
    # the model was trained with.
    encoded_train = [[vocab[w[0]] for w in sentence] for sentence in arr]
    pad_len = max(len(seq) for seq in encoded_train)

    encoded_input = [vocab.get(w, 0) for w in token_list]
    x_testData = pad_sequences(sequences=[encoded_input], padding="post",
                               value=0, maxlen=pad_len)

    model = loadTrainedModel()
    predictions = np.argmax(model.predict(np.array([x_testData[0]])), axis=-1)

    result = []
    for token, tag_idx in zip(token_list, predictions[0]):
        print("{:15}: {:5}".format(token, tags[tag_idx]))
        result.append((token, tags[tag_idx]))
    return result


#print(predictIngredientTag("1 tomato"))
#trainAndSaveModel()
def generate_data(self, count, offset):
    """Generates training data in the CRF++ format for the ingredient
    tagging task and writes it to stdout.

    Args:
        count: number of rows from the dataset to process.
        offset: index of the first row to process.
    """
    df = pd.read_csv(self.opts.data_path)
    df = df.fillna("")

    start = int(offset)
    end = int(offset) + int(count)
    df_slice = df.iloc[start:end]

    for index, row in df_slice.iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            del row["input"]

            rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])
            for i, (token, tags) in enumerate(rowData):
                features = utils.getFeatures(token, i + 1, tokens)
                # print(x) with a single argument works in Python 2 and 3
                # alike, unlike the bare `print x` statement it replaces.
                print(utils.joinLine([token] + features + [self.bestTag(tags)]))
        # ToDo: deal with this
        except UnicodeDecodeError:
            pass

        # Blank line terminates each sentence in CRF++ input format.
        print("")
def readIngredientDataForExtract():
    """Extracts normalized ingredient names from the snapshot CSV and
    appends each unique name with its frequency to ingredients.txt,
    ordered by ascending frequency.
    """
    from collections import Counter

    df = pd.read_csv("nyt-ingredients-snapshot-2015.csv")
    df = df.fillna("fillna")

    names = []
    for index, row in df.iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["name"])
            # strip punctuation and lowercase before counting
            display_input = str(re.sub(r'[^\w\s]', '', display_input)).lower()
            names.append(display_input)
        # ToDo: deal with this
        except UnicodeDecodeError:
            pass

    # Counter counts in one O(n) pass; the original re-scanned the whole
    # list for every word (O(n^2)). Insertion order (first occurrence)
    # is preserved, matching the original dedup order before sorting.
    counts = list(Counter(names).items())

    # BUG FIX: the original called sorted(...) and discarded the result,
    # so the file was never actually sorted by frequency.
    counts.sort(key=lambda item: item[1])

    if counts:
        # The with-statement closes the file; the original's extra
        # the_file.close() inside the block was redundant.
        with open('ingredients.txt', 'a') as the_file:
            for word, count in counts:
                the_file.write(word + " " + str(count) + '\n')
def _generate_data_worker(self, args):
    """Worker for parallel data generation: turns one (index, row) pair
    into CRF++ lines and pushes the joined block onto the output queue.
    """
    index, row = args
    lines = []
    try:
        # extract the display name
        display_input = utils.cleanUnicodeFractions(row["input"])
        tokens = utils.tokenize(display_input)
        del row["input"]

        tagged = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])
        # getFeatures expects a 1-based token position.
        for position, (token, tags) in enumerate(tagged, start=1):
            features = utils.getFeatures(token, position, tokens)
            lines.append(utils.joinLine([token] + features + [self.bestTag(tags)]))
    # ToDo: deal with this
    except UnicodeDecodeError:
        pass

    if lines:
        self.output_queue.put('\n'.join(lines))
def readIngredientData(limit=5000):
    """Reads labeled ingredient rows from the snapshot CSV and converts
    them to tuple arrays for training.

    Args:
        limit: row index at which reading stops (default 5000, matching
            the previously hard-coded cutoff — the constant is now a
            parameter so callers can read more or fewer rows).

    Returns:
        A list of per-row tuple arrays produced by convertTupleArray.
    """
    df = pd.read_csv("nyt-ingredients-snapshot-2015.csv")
    df = df.fillna("fillna")

    retArr = []
    for index, row in df.iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenizeWithoutPunctuation(display_input)
            del row["input"]

            rowData = [(t, matchUp(t, row)) for t in tokens]
            retArr.append(convertTupleArray(rowData, tokens))
        # ToDo: deal with this
        except UnicodeDecodeError:
            pass
        if index == limit:
            break
    return retArr
def generate_data(self, count, offset):
    """Generates training data in the CRF++ format for the ingredient
    tagging task, promoting stray I- tags to B- so every chunk starts
    with a B- tag. Writes the result to stdout.

    Args:
        count: number of rows from the dataset to process.
        offset: index of the first row to process.
    """
    df = pd.read_csv(self.opts.data_path)
    df = df.fillna("")

    start = int(offset)
    end = int(offset) + int(count)
    df_slice = df.iloc[start:end]

    for index, row in df_slice.iterrows():
        prev_tag = None
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            del row["input"]

            taggedTokens = [(t, self.matchUp(t, row)) for t in tokens]
            rowData = self.addPrefixes(taggedTokens)
            for i, (token, tags) in enumerate(rowData):
                features = utils.getFeatures(token, i + 1, tokens)
                best_tag = self.bestTag(tags)
                # An I- tag must continue a chunk of the same type;
                # otherwise promote it to B-.  BUG FIX: the original
                # called prev_tag.split(...) without a None check, so a
                # row whose FIRST token was tagged I-* crashed with an
                # AttributeError instead of being promoted.
                if best_tag.startswith("I-") and (
                        prev_tag is None
                        or best_tag.split("-")[-1] != prev_tag.split("-")[-1]):
                    best_tag = best_tag.replace("I-", "B-")
                # print(x) works in Python 2 and 3 alike.
                print(utils.joinLine([token] + features + [best_tag]))
                prev_tag = best_tag
        # ToDo: deal with this
        except UnicodeDecodeError:
            pass

        # Blank line terminates each sentence in CRF++ input format.
        print("")
def generate_data(self, count, offset):
    """Generates training data in the CRF++ format for the ingredient
    tagging task and writes it to stdout.

    Args:
        count: number of rows from the dataset to process.
        offset: index of the first row to process (assumed non-negative,
            as in the original's int(offset) slicing).
    """
    from itertools import islice

    start = int(offset)
    end = int(offset) + int(count)

    with open(self.opts.data_path, "r") as csvfile:
        reader = csv.DictReader(csvfile)
        # islice streams just the requested window of rows instead of
        # materializing the entire CSV in memory before slicing.
        data_slice = list(islice(reader, start, end))

    for row in data_slice:
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            del row["input"]

            rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])
            for i, (token, tags) in enumerate(rowData):
                features = utils.getFeatures(token, i + 1, tokens)
                # print(x) works in Python 2 and 3 alike.
                print(utils.joinLine([token] + features + [self.bestTag(tags)]))
        # ToDo: deal with this
        except UnicodeDecodeError:
            pass

        # Blank line terminates each sentence in CRF++ input format.
        print("")