import urllib2

from unidecode import unidecode

# tokenize, paragraph_iter, frequency_table, and split_by_paragraph are
# helpers defined elsewhere in this project; frequency_table is assumed to
# return a pandas DataFrame (one row per paragraph, one column per token).


def token_block_iter(title, html):
    wiki = paragraph_iter(html)
    token_table = map(tokenize, wiki)
    freq = frequency_table(token_table)

    wiki_title = unicode(wiki.title)
    wiki_title_tokens = set(tokenize(wiki_title))
    wiki_index = wiki.id

    # Column normalize frequency table
    freq /= freq.sum(axis=0)

    for para_n, tokens in enumerate(token_table):
        # Drop tokens that also appear in the article title
        tokens = set(tokens).difference(wiki_title_tokens)
        tokens = list(tokens)
        local_freq = freq[tokens].ix[para_n]
        yield para_n, tokens, wiki_index, wiki_title, local_freq
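# A minimal sketch of the assumed frequency_table helper, for reference only;
# the project's real implementation lives elsewhere and may differ. The code
# above only relies on it returning a pandas DataFrame with one row per
# paragraph and one column per token, holding raw token counts.
def frequency_table_sketch(token_table):
    from collections import Counter

    import pandas as pd

    # One Counter of token occurrences per paragraph; DataFrame aligns the
    # union of all tokens into columns, and fillna zeros the absent ones.
    counts = [Counter(tokens) for tokens in token_table]
    return pd.DataFrame(counts).fillna(0)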
def find_TIL_match(js):
    text = js["wiki"]
    url = js["url"]

    # Recover the article title from the URL
    wiki_title = url.split('/')[-1].replace('_', ' ').lower()
    wiki_title = wiki_title.split('#')[0]
    wiki_title = unidecode(urllib2.unquote(wiki_title))
    wiki_title_tokens = set(wiki_title.split())

    TIL_text = js["title"]
    TIL_tokens = set(tokenize(TIL_text))

    # Remove special tokens from the TIL
    TIL_tokens = [x for x in TIL_tokens if len(x) > 2 and "TOKEN_" not in x]

    paragraphs = list(split_by_paragraph(text))
    tokens = map(tokenize, paragraphs)
    freq = frequency_table(tokens)

    # Find words in the TIL that are used in the text
    matching_columns = list(set(freq.columns).intersection(TIL_tokens))

    # Idea: score each paragraph by its highest ranked match
    df = freq[matching_columns]

    # Column normalize (each token by its total count across paragraphs),
    # so tokens unique to a paragraph count for more!
    df /= df.sum(axis=0)
    df.fillna(0, inplace=True)

    # Find the top scoring paragraph
    score = df.sum(axis=1)
    top_idx = score.argmax()
    match_text = paragraphs[top_idx]

    # Now, normalize off the full frequency table for the entropy weight
    freq /= freq.sum(axis=0)
    freq.fillna(0, inplace=True)

    tokens = list(freq.columns[freq.ix[top_idx] > 0])
    weights = freq[tokens].ix[top_idx]

    # Convert them into SQL-able formats
    w_str = '[{}]'.format(','.join(map("{:0.2f}".format, weights)))

    d_out = {
        "reddit_idx": js["name"],
        "TIL": TIL_text,
        "unprocessed_wikitext": match_text,
        "tokens": ' '.join(tokens),
        "url": url,
        "score": js["score"],
        "weights": w_str,
    }

    key_order = ["reddit_idx", "TIL", "unprocessed_wikitext",
                 "tokens", "url", "score", "weights"]
    data_match = [d_out[key] for key in key_order]

    # Save the remaining paragraphs
    data_unmatch = []
    for n in range(len(paragraphs)):
        if n != top_idx:
            tokens = list(freq.columns[freq.ix[n] > 0])
            weights = freq[tokens].ix[n]
            assert len(tokens) == len(weights)

            if len(tokens) > 3:
                # Convert them into SQL-able formats
                w_str = '[{}]'.format(','.join(map("{:0.2f}".format, weights)))
                t_str = ' '.join(tokens)
                data_unmatch.append([t_str, w_str])

    return data_match, data_unmatch
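# Hedged usage sketch: find_TIL_match expects a Reddit submission record with
# the fields read above. All values below are placeholders, not project data.
if __name__ == "__main__":
    example_js = {
        "name": "t3_example",  # reddit fullname of the submission
        "title": "TIL that example text ...",
        "score": 1234,
        "url": "https://en.wikipedia.org/wiki/Example",
        "wiki": "First paragraph ...\n\nSecond paragraph ...",
    }
    data_match, data_unmatch = find_TIL_match(example_js)
    print data_match[2]  # unprocessed_wikitext of the best-matching paragraph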