from itertools import chain

import numpy as np
import pandas as pd


def generatedata(path: str, testprop=0, parallel=False):
    """
    Return parsed and formatted sequences X, y to pass to python-crfsuite.
    X is a list of dictionaries containing features for each word.
    y is a list of labels with IOB tags.
    If testprop > 0 is specified, split X, y into training and testing sets
    and return X_train, y_train, X_test, y_test (in that order).
    """
    df = pd.read_csv(path)

    # Filter entries whose original entry (input) or ingredient name is missing
    df = df.loc[pd.notna(df.name) & pd.notna(df.input)]

    # Randomly assign each row to the training (True) or testing (False) set
    ind = np.random.choice([True, False],
                           size=len(df),
                           p=[1 - testprop, testprop])

    if parallel:
        from pandarallel import pandarallel
        pandarallel.initialize(verbose=False)
        df.input = df.input.parallel_apply(
            lambda line: tokenize(line, preprocess=True))
        labels = df.parallel_apply(matchtags, axis=1)
        features = df.input.parallel_apply(getfeatures)
        ioblabels = labels.parallel_apply(iobtag)
    else:
        df.input = df.input.apply(lambda line: tokenize(line, preprocess=True))
        labels = df.apply(matchtags, axis=1)
        features = df.input.apply(getfeatures)
        ioblabels = labels.apply(iobtag)

    X_train = list(chain.from_iterable(features[ind]))
    y_train = list(chain.from_iterable(ioblabels[ind]))
    X_test = list(chain.from_iterable(features[np.invert(ind)]))
    y_test = list(chain.from_iterable(ioblabels[np.invert(ind)]))

    if testprop == 0:
        return X_train, y_train
    return X_train, y_train, X_test, y_test
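# A minimal sketch of how the sequences returned by generatedata might be
# fed to python-crfsuite, assuming X_train/y_train form a single token
# sequence as the docstring describes. The CSV path and model filename are
# hypothetical.
import pycrfsuite

X_train, y_train, X_test, y_test = generatedata("nyt-ingredients.csv",
                                                testprop=0.1)

trainer = pycrfsuite.Trainer(verbose=False)
trainer.append(X_train, y_train)
trainer.train('ingredients.crfsuite')

tagger = pycrfsuite.Tagger()
tagger.open('ingredients.crfsuite')
predicted = tagger.tag(X_test)  # compare against y_test to evaluate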
def TIL_corpus_iter(skip=1):
    """Yield the unique tokens of every `skip`-th row in the training table."""
    cmd_select = "SELECT unprocessed_wikitext FROM training"
    cursor = conn_train.execute(cmd_select)
    for k, (text, ) in enumerate(cursor):
        if k % skip == 0:
            text = unicode(text)
            text = ' '.join(set(tokenize(text)))
            yield text.split()
def getfeatures(line):
    """
    Return a list of feature dictionaries, one per token, suitable for
    python-crfsuite. Accepts either a raw string or a pre-tokenized line.
    """
    if type(line) is str:
        line = tokenize(line, preprocess=True)

    features = []
    comma = False
    isparenthetical = False
    for i in range(len(line)):
        token = line[i]
        # Close the parenthetical before featurizing, so the ')' itself is
        # not marked as parenthetical
        if token == ')':
            isparenthetical = False

        token_features = {
            'token': token.lower(),
            'capitalized': token.istitle(),
            'parenthetical': isparenthetical,
            'unit': isunit(token),
            'numeric': isquantity(token),
            'symbol': token in symbols,
            'followscomma': comma
        }

        # Features of the previous token (or a start-of-line marker)
        if i == 0:
            prev_features = {'start': True}
        else:
            prv = line[i - 1]
            prev_features = {
                '-1token': prv.lower(),
                '-1capitalized': prv.istitle(),
                '-1numeric': isquantity(prv),
                '-1symbol': prv in symbols
            }

        # Features of the next token (or an end-of-line marker)
        if i == len(line) - 1:
            next_features = {'end': True}
        else:
            nxt = line[i + 1]
            next_features = {
                '+1token': nxt.lower(),
                '+1capitalized': nxt.istitle(),
                '+1numeric': isquantity(nxt),
                '+1symbol': nxt in symbols
            }

        token_features.update(prev_features)
        token_features.update(next_features)
        features.append(token_features)

        # Toggle the comma flag at each top-level (non-parenthesized) comma;
        # '(' is flagged after featurizing, so it is not marked itself
        if not isparenthetical and token == ',':
            comma = not comma
        if token == '(':
            isparenthetical = True
    return features
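# isunit and isquantity (and the symbols set) are assumed helpers that are
# not defined in this section; a plausible minimal version, assuming units
# are checked against a small whitelist and quantities may be integers,
# decimals, or fractions:
import re

UNITS = {'cup', 'cups', 'tablespoon', 'tbsp', 'teaspoon', 'tsp',
         'pound', 'lb', 'ounce', 'oz', 'gram', 'g'}  # illustrative subset


def isquantity(token):
    # Matches "2", "1.5", and "1/2"-style tokens
    return bool(re.match(r'^\d+(\.\d+)?$|^\d+/\d+$', token))


def isunit(token):
    return token.lower().rstrip('.') in UNITS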
def matchtags(row):
    """
    Match each token in the input (raw text) to the appropriate label,
    if one exists:
    - We attempt to match singular and pluralized tokens ("shallot", "shallots")
    - Matching of fractions and floats is handled (1 1/2, 1.50)
    - We attempt to match units in alternative representations (tbsp, T, tablespoon)
    Return a list of labels.
    """
    ingr_tokens = tokenize(row["name"], preprocess=True)
    unit_tokens = tokenize(row["unit"], preprocess=True)
    # comment_tokens = tokenize(row["comment"], preprocess=True)

    labels = []
    for token in row["input"]:
        if asfloat(token) == row["qty"]:
            labels.append("QTY")
        elif round_2f(asfloat(token)) == row["range_end"]:
            labels.append("QTY-UR")
        elif any(
                tokenmatch(standardize(token).lower(), u.lower())
                for u in unit_tokens):
            labels.append("UNIT")
        elif any(tokenmatch(token.lower(), i.lower()) for i in ingr_tokens):
            labels.append("INGR")
        # elif token.lower() in comment_tokens:
        #     labels.append("CMNT")
        else:
            labels.append(None)
    return labels
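# asfloat, round_2f, tokenmatch, and standardize are assumed helpers not
# shown in this section; a minimal sketch of the first three, consistent
# with the docstring (fractions and decimals parsed to floats, rounding to
# two places, singular/plural-tolerant matching):
from fractions import Fraction


def asfloat(token):
    # "1.5" -> 1.5, "1/2" -> 0.5, "1 1/2" -> 1.5; None if not numeric
    try:
        return float(sum(Fraction(part) for part in token.split()))
    except (ValueError, ZeroDivisionError):
        return None


def round_2f(x):
    return round(x, 2) if x is not None else None


def tokenmatch(a, b):
    # Treat "shallot" and "shallots" as the same token
    return a == b or a.rstrip('s') == b.rstrip('s')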
def token_block_iter(title, html):
    wiki = paragraph_iter(html)
    token_table = map(tokenize, wiki)
    freq = frequency_table(token_table)

    wiki_title = unicode(wiki.title)
    wiki_title_tokens = set(tokenize(wiki_title))
    wiki_index = wiki.id

    # Column normalize frequency table
    freq /= freq.sum(axis=0)

    for para_n, tokens in enumerate(token_table):
        # Drop tokens that also appear in the article title
        tokens = list(set(tokens).difference(wiki_title_tokens))
        local_freq = freq[tokens].ix[para_n]
        yield para_n, tokens, wiki_index, wiki_title, local_freq
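# frequency_table is defined elsewhere; from its use here and in
# find_TIL_match below, it appears to build a paragraphs-by-tokens count
# matrix. A minimal sketch of that assumed behavior:
from collections import Counter
import pandas as pd


def frequency_table(token_table):
    # One row per paragraph, one column per token, values are raw counts
    counts = [Counter(tokens) for tokens in token_table]
    return pd.DataFrame(counts).fillna(0)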
def setUp(self):
    expression = '(abc)'
    self.expression_tokenized = tokenize(expression)
base_url = "http://en.wikipedia.org/w/index.php?title={}" try: url = base_url.format(urllib.quote(title)) print u"# [{:s}]({:s})".format(title,url) print text.encode('utf-8') print except: pass for item in conn_decoy.execute(cmd_select_positive_scores): idx,wiki_idx,title,para_n = item html = conn_wiki.execute(cmd_select_wiki, (title,)).next()[0] paragraphs = list(paragraph_iter(html)) title = unicode(title) if paragraphs: print title text = paragraphs[para_n] tokens = u' '.join(tokenize(text)) conn_report.execute(cmd_insert_into_report, (title, text, tokens)) conn_report.commit()
cmd_select_wiki = '''SELECT text FROM wiki WHERE title=?'''


def pprint_item(title, text):
    base_url = "http://en.wikipedia.org/w/index.php?title={}"
    try:
        url = base_url.format(urllib.quote(title))
        print u"# [{:s}]({:s})".format(title, url)
        print text.encode('utf-8')
        print
    except:
        # Skip titles that fail to URL-quote or encode
        pass


for item in conn_decoy.execute(cmd_select_positive_scores):
    idx, wiki_idx, title, para_n = item
    html = conn_wiki.execute(cmd_select_wiki, (title, )).next()[0]
    paragraphs = list(paragraph_iter(html))
    title = unicode(title)

    if paragraphs:
        print title
        text = paragraphs[para_n]
        tokens = u' '.join(tokenize(text))
        conn_report.execute(cmd_insert_into_report, (title, text, tokens))

conn_report.commit()
def find_TIL_match(js): text = js["wiki"] url = js["url"] wiki_title = url.split('/')[-1].replace('_',' ').lower() wiki_title = wiki_title.split('#')[0] wiki_title = unidecode(urllib2.unquote(wiki_title)) wiki_title_tokens = set(wiki_title.split()) TIL_text = js["title"] TIL_tokens = set(tokenize(TIL_text)) # Remove special tokens from TIL TIL_tokens = [x for x in TIL_tokens if len(x)>2 and "TOKEN_" not in x] paragraphs = list(split_by_paragraph(text)) tokens = map(tokenize,paragraphs) freq = frequency_table(tokens) # Find words in TIL used in text matching_columns = list(set(freq.columns).intersection(TIL_tokens)) # Idea: Score each paragraph with the highest ranked match df = freq[matching_columns] # Row normalize, thus unique words count for more! df /= df.sum(axis=0) df.fillna(0,inplace=True) # Find the top scoring paragraph score = df.sum(axis=1) top_idx = score.argmax() match_text = paragraphs[top_idx] # Now, normalize off the full frequency table for the entropy weight freq /= freq.sum(axis=0) freq.fillna(0,inplace=True) tokens = list(freq.columns[freq.ix[top_idx]>0]) weights = freq[tokens].ix[top_idx] # Convert them into SQL-able formats w_str='[{}]'.format(','.join(map("{:0.2f}".format, weights))) d_out = { "reddit_idx" : js["name"], "TIL" : TIL_text, "unprocessed_wikitext" : match_text, "tokens" : ' '.join(tokens), "url" : url, "score" : js["score"], "weights" : w_str } key_order = ["reddit_idx", "TIL", "unprocessed_wikitext", "tokens", "url", "score", "weights"] data_match = [d_out[key] for key in key_order] # Save the remaining parargraphs data_unmatch = [] for n in range(len(paragraphs)): if n != top_idx: tokens = list(freq.columns[freq.ix[n]>0]) weights = freq[tokens].ix[n] assert(len(tokens)==len(weights)) if len(tokens)>3: # Convert them into SQL-able formats w_str='[{}]'.format(','.join(map("{:0.2f}".format, weights))) t_str = ' '.join(tokens) data_unmatch.append( [t_str, w_str] ) return data_match, data_unmatch