Example 1
def token_block_iter(title, html):

    # Split the article HTML into paragraphs and tokenize each one
    wiki = paragraph_iter(html)

    token_table = list(map(tokenize, wiki))
    freq = frequency_table(token_table)

    wiki_title = str(wiki.title)
    wiki_title_tokens = set(tokenize(wiki_title))

    wiki_index = wiki.id

    # Column-normalize the frequency table
    freq /= freq.sum(axis=0)

    for para_n, tokens in enumerate(token_table):
        # Drop tokens that also appear in the article title
        tokens = list(set(tokens).difference(wiki_title_tokens))

        local_freq = freq[tokens].iloc[para_n]
        yield para_n, tokens, wiki_index, wiki_title, local_freq
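
Both examples rely on a frequency_table helper that is not shown on this page. Below is a minimal sketch of what it might look like, inferred only from how its result is indexed (a pandas DataFrame with one row per paragraph and one column per token, holding raw counts); the original project's implementation may differ.

from collections import Counter

import pandas as pd

def frequency_table(token_table):
    # Hypothetical sketch, not the project's own code: one Counter of token
    # occurrences per paragraph; the DataFrame aligns the token columns and
    # leaves NaN where a token never occurs, which is zeroed out here.
    return pd.DataFrame([Counter(tokens) for tokens in token_table]).fillna(0)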
Example 3
from unidecode import unidecode
from urllib.parse import unquote


def find_TIL_match(js):
    text = js["wiki"]
    url = js["url"]

    # Recover a readable article title from the Wikipedia URL
    wiki_title = url.split('/')[-1].replace('_', ' ').lower()
    wiki_title = wiki_title.split('#')[0]
    wiki_title = unidecode(unquote(wiki_title))
    wiki_title_tokens = set(wiki_title.split())

    TIL_text = js["title"]
    TIL_tokens = set(tokenize(TIL_text))

    # Remove special tokens from TIL
    TIL_tokens = [x for x in TIL_tokens if len(x) > 2 and "TOKEN_" not in x]

    # Split the article text into paragraphs and build a token-count table
    paragraphs = list(split_by_paragraph(text))
    tokens = list(map(tokenize, paragraphs))
    freq = frequency_table(tokens)

    # Find words in TIL used in text
    matching_columns = list(set(freq.columns).intersection(TIL_tokens))

    # Idea: Score each paragraph with the highest ranked match
    df = freq[matching_columns]
    
    # Column-normalize, so words unique to a paragraph count for more
    df /= df.sum(axis=0)
    df.fillna(0, inplace=True)


    # Find the top scoring paragraph
    score = df.sum(axis=1)
    top_idx = score.argmax()
    match_text = paragraphs[top_idx]

    # Now column-normalize the full frequency table for the entropy weights
    freq /= freq.sum(axis=0)
    freq.fillna(0, inplace=True)
    tokens = list(freq.columns[freq.iloc[top_idx] > 0])
    weights = freq[tokens].iloc[top_idx]

    # Convert them into SQL-able formats
    w_str = '[{}]'.format(','.join(map("{:0.2f}".format, weights)))

    d_out = {
        "reddit_idx" : js["name"],
        "TIL"        : TIL_text,
        "unprocessed_wikitext"   : match_text,
        "tokens"   : ' '.join(tokens),
        "url"        : url,
        "score"      : js["score"],
        "weights"    : w_str
    }

    key_order = ["reddit_idx", "TIL",
                 "unprocessed_wikitext", "tokens",
                 "url", "score", "weights"]

    data_match = [d_out[key] for key in key_order]

    # Save the remaining paragraphs
    data_unmatch = []
    
    for n in range(len(paragraphs)):
        if n != top_idx:
            tokens = list(freq.columns[freq.iloc[n] > 0])
            weights = freq[tokens].iloc[n]

            assert len(tokens) == len(weights)
            if len(tokens) > 3:
                # Convert them into SQL-able formats
                w_str = '[{}]'.format(','.join(map("{:0.2f}".format, weights)))
                t_str = ' '.join(tokens)
                data_unmatch.append([t_str, w_str])

    return data_match, data_unmatch
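
A hypothetical call to find_TIL_match is sketched below. The dictionary keys mirror what the function reads from the Reddit submission JSON (name, title, score, url, wiki); the values are placeholders, not real data.

js = {
    "name": "t3_abc123",                           # Reddit fullname (placeholder)
    "title": "TIL that honey never spoils",        # TIL headline (placeholder)
    "score": 1234,                                 # submission score (placeholder)
    "url": "https://en.wikipedia.org/wiki/Honey",  # linked article (placeholder)
    "wiki": "Honey is a sweet substance...\n\nBees produce honey...",
}

data_match, data_unmatch = find_TIL_match(js)
# data_match:   one row of values for the best-matching paragraph
# data_unmatch: [tokens, weights] pairs for the other paragraphs with >3 tokens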